library(readr)
Wine <- read.csv("Wine.csv")
The dataframe has 178 Rows by 14 Columns
dim(Wine)
## [1] 178 14
The following variables are listed as numerics/integers:
Alcohol, Malic_Acid, Ash, Ash_Alcalinity, Magnesium,
Total_Phenols,
Flavanoids, Nonflavanoid_Phenols, Proanthocyanins, Color_Intensity,
Hue,
OD280_OD315, Proline
The variable “Type” is a character.
str(Wine)
## 'data.frame': 178 obs. of 14 variables:
## $ Type : chr "A" "A" "A" "A" ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic_Acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Ash_Alcalinity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Total_Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 NA 3.15 ...
## $ Nonflavanoid_Phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color_Intensity : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD280_OD315 : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
Wine$Type <- as.factor(Wine$Type)
Wine$Alcohol <- as.numeric(Wine$Alcohol)
Wine$Malic_Acid <- as.numeric(Wine$Malic_Acid)
Wine$Ash <- as.numeric(Wine$Ash)
Wine$Ash_Alcalinity <- as.numeric(Wine$Ash_Alcalinity)
Wine$Magnesium <- as.numeric(Wine$Magnesium)
Wine$Total_Phenols <- as.numeric(Wine$Total_Phenols)
Wine$Flavanoids <- as.numeric(Wine$Flavanoids)
Wine$Nonflavanoid_Phenols <- as.numeric(Wine$Nonflavanoid_Phenols)
Wine$Proanthocyanins <- as.numeric(Wine$Proanthocyanins)
Wine$Color_Intensity <- as.numeric(Wine$Color_Intensity)
Wine$Hue <- as.numeric(Wine$Hue)
Wine$OD280_OD315 <- as.numeric(Wine$OD280_OD315)
Wine$Proline <- as.numeric(Wine$Proline)
Check factors for consistency
levels(Wine$Type)
## [1] "A" "B" "C"
The resulting factors identified, “A” “B” “C” are accurate.
Testing for Duplicates
any(duplicated(Wine))
## [1] FALSE
The results of the test indicate there are no duplicates
Test to see if there are any observations missing the Type value
any(is.na(Wine))
## [1] TRUE
colSums(is.na(Wine))
## Type Alcohol Malic_Acid
## 0 0 0
## Ash Ash_Alcalinity Magnesium
## 0 0 0
## Total_Phenols Flavanoids Nonflavanoid_Phenols
## 0 1 0
## Proanthocyanins Color_Intensity Hue
## 1 0 0
## OD280_OD315 Proline
## 0 0
describe(Wine)
## vars n mean sd median trimmed mad min
## Type* 1 178 1.94 0.78 2.00 1.92 1.48 1.00
## Alcohol 2 178 13.00 0.81 13.05 13.01 1.01 11.03
## Malic_Acid 3 178 2.34 1.12 1.87 2.21 0.77 0.74
## Ash 4 178 2.37 0.27 2.36 2.37 0.24 1.36
## Ash_Alcalinity 5 178 19.49 3.34 19.50 19.42 3.04 10.60
## Magnesium 6 178 99.74 14.28 98.00 98.44 14.83 70.00
## Total_Phenols 7 178 2.30 0.63 2.36 2.29 0.75 0.98
## Flavanoids 8 177 2.02 1.00 2.13 2.01 1.23 0.34
## Nonflavanoid_Phenols 9 178 0.36 0.12 0.34 0.36 0.13 0.13
## Proanthocyanins 10 177 1.58 0.57 1.55 1.56 0.55 0.41
## Color_Intensity 11 178 5.06 2.32 4.69 4.83 2.24 1.28
## Hue 12 178 0.96 0.23 0.96 0.96 0.24 0.48
## OD280_OD315 13 178 2.61 0.71 2.78 2.63 0.77 1.27
## Proline 14 178 746.89 314.91 673.50 719.30 300.23 278.00
## max range skew kurtosis se
## Type* 3.00 2.00 0.11 -1.34 0.06
## Alcohol 14.83 3.80 -0.05 -0.89 0.06
## Malic_Acid 5.80 5.06 1.02 0.22 0.08
## Ash 3.23 1.87 -0.17 1.03 0.02
## Ash_Alcalinity 30.00 19.40 0.21 0.40 0.25
## Magnesium 162.00 92.00 1.08 1.96 1.07
## Total_Phenols 3.88 2.90 0.09 -0.87 0.05
## Flavanoids 5.08 4.74 0.04 -0.91 0.08
## Nonflavanoid_Phenols 0.66 0.53 0.44 -0.68 0.01
## Proanthocyanins 3.58 3.17 0.51 0.53 0.04
## Color_Intensity 13.00 11.72 0.85 0.30 0.17
## Hue 1.71 1.23 0.02 -0.40 0.02
## OD280_OD315 4.00 2.73 -0.30 -1.11 0.05
## Proline 1680.00 1402.00 0.75 -0.31 23.60
The results of the test indicate there are no observations missing the Type value
Step 1: Test to see if any data is NA
any(is.na(Wine))
## [1] TRUE
The result shows that this data set does include NA values
Step 2: Test to see which columns have NA values.
colSums(is.na(Wine))
## Type Alcohol Malic_Acid
## 0 0 0
## Ash Ash_Alcalinity Magnesium
## 0 0 0
## Total_Phenols Flavanoids Nonflavanoid_Phenols
## 0 1 0
## Proanthocyanins Color_Intensity Hue
## 1 0 0
## OD280_OD315 Proline
## 0 0
The result shows that the Flavanoids and Proanthocyanins columns each have one NA value
Step 3: Test to see which row has an NA value for Flavanoids
Wine$Flavanoids %>% is.na %>% which
## [1] 9
The result show that row 9 has an NA value
Step 4: Turn NA value into the mean of the column for Flavanoids
Wine$Flavanoids[is.na(Wine$Flavanoids)] <- mean(Wine$Flavanoids,na.rm=TRUE)
Test to see which row has an NA value for Proanthocyanins
Wine$Proanthocyanins %>% is.na %>% which
## [1] 14
Step 5: Turn NA value into the mean of the column for Proanthocyanins
Wine$Proanthocyanins[is.na(Wine$Proanthocyanins)] <- mean(Wine$Proanthocyanins,na.rm=TRUE)
summary(Wine)
## Type Alcohol Malic_Acid Ash Ash_Alcalinity
## A:59 Min. :11.03 Min. :0.740 Min. :1.360 Min. :10.60
## B:71 1st Qu.:12.36 1st Qu.:1.603 1st Qu.:2.210 1st Qu.:17.20
## C:48 Median :13.05 Median :1.865 Median :2.360 Median :19.50
## Mean :13.00 Mean :2.336 Mean :2.367 Mean :19.49
## 3rd Qu.:13.68 3rd Qu.:3.083 3rd Qu.:2.558 3rd Qu.:21.50
## Max. :14.83 Max. :5.800 Max. :3.230 Max. :30.00
## Magnesium Total_Phenols Flavanoids Nonflavanoid_Phenols
## Min. : 70.00 Min. :0.980 Min. :0.340 Min. :0.1300
## 1st Qu.: 88.00 1st Qu.:1.742 1st Qu.:1.205 1st Qu.:0.2700
## Median : 98.00 Median :2.355 Median :2.120 Median :0.3400
## Mean : 99.74 Mean :2.295 Mean :2.024 Mean :0.3619
## 3rd Qu.:107.00 3rd Qu.:2.800 3rd Qu.:2.842 3rd Qu.:0.4375
## Max. :162.00 Max. :3.880 Max. :5.080 Max. :0.6600
## Proanthocyanins Color_Intensity Hue OD280_OD315
## Min. :0.410 Min. : 1.280 Min. :0.4800 Min. :1.270
## 1st Qu.:1.250 1st Qu.: 3.220 1st Qu.:0.7825 1st Qu.:1.938
## Median :1.555 Median : 4.690 Median :0.9650 Median :2.780
## Mean :1.584 Mean : 5.058 Mean :0.9574 Mean :2.612
## 3rd Qu.:1.950 3rd Qu.: 6.200 3rd Qu.:1.1200 3rd Qu.:3.170
## Max. :3.580 Max. :13.000 Max. :1.7100 Max. :4.000
## Proline
## Min. : 278.0
## 1st Qu.: 500.5
## Median : 673.5
## Mean : 746.9
## 3rd Qu.: 985.0
## Max. :1680.0
high_Magnesium <- ifelse(Wine$Magnesium<100, "low", "high")
high_Magnesium <- as.factor(high_Magnesium)
boxplot(Wine$Malic_Acid, main = 'Malic Acid Boxplot', ylab = 'Malic Acid Level',col='Red')
plot(Wine$Malic_Acid,Wine$Ash, main = 'Malic Acid & Ash Scatterplot', xlab = "Malic Acid", ylab = "Ash")
abline(lm(Ash ~ Malic_Acid,data=Wine),col='red')
plot(Wine$Proline ~ Wine$Type, main = "Proline by Type", xlab = "Type", ylab = "Proline", col='Red' )
Wine_A <- subset(Wine, Wine$Type == "A" & Wine$Magnesium > 100)
Wine_A <- data.frame(Wine_A)
Wine_A[1:5,]
## Type Alcohol Malic_Acid Ash Ash_Alcalinity Magnesium Total_Phenols
## 1 A 14.23 1.71 2.43 15.6 127 2.80
## 3 A 13.16 2.36 2.67 18.6 101 2.80
## 4 A 14.37 1.95 2.50 16.8 113 3.85
## 5 A 13.24 2.59 2.87 21.0 118 2.80
## 6 A 14.20 1.76 2.45 15.2 112 3.27
## Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue
## 1 3.06 0.28 2.29 5.64 1.04
## 3 3.24 0.30 2.81 5.68 1.03
## 4 3.49 0.24 2.18 7.80 0.86
## 5 2.69 0.39 1.82 4.32 1.04
## 6 3.39 0.34 1.97 6.75 1.05
## OD280_OD315 Proline
## 1 3.92 1065
## 3 3.17 1185
## 4 3.45 1480
## 5 2.93 735
## 6 2.85 1450
avgAlcByType <- aggregate(Wine$Alcohol ~ Wine$Type, FUN = 'mean')
avgAlcByType
## Wine$Type Wine$Alcohol
## 1 A 13.74475
## 2 B 12.27873
## 3 C 13.15375