Execute the following cell to load the tidyverse library:
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.4.0 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.1
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'purrr' was built under R version 4.1.1
## Warning: package 'dplyr' was built under R version 4.1.1
## Warning: package 'stringr' was built under R version 4.1.1
## Warning: package 'forcats' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Execute the following cell to load the data. A description of the
data set is available at http://openmv.net/info/food-texture:
# Read the food-texture CSV from its URL; the first column supplies the row names.
file <- "http://openmv.net/file/food-texture.csv"
foodData <- read.csv(file = file, row.names = 1, header = TRUE)
Print structure of dataframe
str(foodData)
## 'data.frame': 50 obs. of 5 variables:
## $ Oil : num 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
## $ Density : int 2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
## $ Crispy : int 10 14 12 10 11 13 13 10 11 11 ...
## $ Fracture: int 23 9 17 31 26 16 17 26 23 24 ...
## $ Hardness: int 97 139 143 95 143 189 114 63 123 132 ...
Print first 5 samples of data frame
head(foodData, n = 5)
Modify the data frame
# Give the Oil column the more descriptive name OilPercentage, then inspect the result
foodData <- rename(foodData, OilPercentage = Oil)
str(foodData)
## 'data.frame': 50 obs. of 5 variables:
## $ OilPercentage: num 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
## $ Density : int 2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
## $ Crispy : int 10 14 12 10 11 13 13 10 11 11 ...
## $ Fracture : int 23 9 17 31 26 16 17 26 23 24 ...
## $ Hardness : int 97 139 143 95 143 189 114 63 123 132 ...
# Recode Crispy into a two-value label: scores above 11 become "high",
# all other scores become "low". The column now holds these strings
# (character type), not numeric 0/1 codes.
foodData <- foodData %>%
  mutate(Crispy = ifelse(Crispy > 11, "high", "low"))
head(foodData, 5)
str(foodData)
## 'data.frame': 50 obs. of 5 variables:
## $ OilPercentage: num 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
## $ Density : int 2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
## $ Crispy : chr "low" "high" "high" "low" ...
## $ Fracture : int 23 9 17 31 26 16 17 26 23 24 ...
## $ Hardness : int 97 139 143 95 143 189 114 63 123 132 ...
# Preview only: lapply() over the one-column selection yields a list holding
# Crispy as a factor; foodData itself is not modified here.
foodData["Crispy"] %>% lapply(factor) %>% str()
## List of 1
## $ Crispy: Factor w/ 2 levels "high","low": 2 1 1 2 2 1 1 2 2 2 ...
# lapply() demo: apply toupper() to each element of a character vector.
proglangs <- c("R", "Python", "Julia", "Java", "Erlang")
str(proglangs)
## chr [1:5] "R" "Python" "Julia" "Java" "Erlang"
# lapply() always returns a list, one element per input element.
proglangs_modified <- lapply(X = proglangs, FUN = toupper)
str(proglangs_modified)
## List of 5
## $ : chr "R"
## $ : chr "PYTHON"
## $ : chr "JULIA"
## $ : chr "JAVA"
## $ : chr "ERLANG"
# Convert the Crispy column from character to factor, in place
foodData["Crispy"] <- lapply(X = foodData["Crispy"], FUN = factor)
# Factor demo: the same labels stored first as character, then as a factor.
marks <- c("high", "low", "low", "high", "high")
str(marks)
## chr [1:5] "high" "low" "low" "high" "high"
marks[1]
## [1] "high"
# factor() stores integer codes plus the sorted set of distinct labels (levels).
marks_cat <- factor(x = marks)
str(marks_cat)
## Factor w/ 2 levels "high","low": 1 2 2 1 1
marks_cat[1]
## [1] high
## Levels: high low
Print structure of modified data frame
str(foodData)
## 'data.frame': 50 obs. of 5 variables:
## $ OilPercentage: num 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
## $ Density : int 2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
## $ Crispy : Factor w/ 2 levels "high","low": 2 1 1 2 2 1 1 2 2 2 ...
## $ Fracture : int 23 9 17 31 26 16 17 26 23 24 ...
## $ Hardness : int 97 139 143 95 143 189 114 63 123 132 ...
Make a scatter plot between Density (x-axis) and OilPercentage
(y-axis). What do you observe from this plot?
# Scatter plot of OilPercentage (y) against Density (x), built in two steps:
# the base plot with its aesthetic mapping, then the point layer.
p1 <- ggplot(foodData, aes(x = Density, y = OilPercentage))
p1 <- p1 + geom_point(size = 1)
p1

Make a scatter plot between Density (x-axis) and OilPercentage
(y-axis) color coded using Crispy. What do you observe from this
plot?
# Same scatter plot as before, with the points colored by the Crispy label.
p2 <- ggplot(foodData, aes(x = Density, y = OilPercentage, color = Crispy))
p2 <- p2 + geom_point(size = 1)
p2

Calculate Pearson’s correlation between Density and
OilPercentage.
# Pearson correlation between the Density and OilPercentage columns
with(foodData, cor(Density, OilPercentage, method = "pearson"))
## [1] -0.750024
str(foodData)
## 'data.frame': 50 obs. of 5 variables:
## $ OilPercentage: num 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
## $ Density : int 2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
## $ Crispy : Factor w/ 2 levels "high","low": 2 1 1 2 2 1 1 2 2 2 ...
## $ Fracture : int 23 9 17 31 26 16 17 26 23 24 ...
## $ Hardness : int 97 139 143 95 143 189 114 63 123 132 ...
# Step 1: center Density by subtracting its mean
with(foodData, Density - mean(Density))
## [1] 97.4 -197.6 12.4 62.4 117.4 -67.6 -107.6 -87.6 97.4 87.4
## [11] -27.6 -22.6 2.4 107.4 72.4 -87.6 -207.6 32.4 -162.6 -102.6
## [21] 142.4 122.4 -77.6 -152.6 -32.6 17.4 87.4 62.4 -12.6 -212.6
## [31] 222.4 -32.6 267.4 -77.6 42.4 -287.6 -222.6 -132.6 7.4 117.4
## [41] 122.4 12.4 62.4 242.4 52.4 7.4 137.4 67.4 -157.6 -12.6
# Step 2: standard deviation of Density
with(foodData, sd(Density))
## [1] 124.5
# Step 3: centered values divided by the standard deviation
with(foodData, (Density - mean(Density)) / sd(Density))
## [1] 0.78232945 -1.58714886 0.09959841 0.50120490 0.94297204 -0.54297198
## [7] -0.86425717 -0.70361457 0.78232945 0.70200815 -0.22168678 -0.18152613
## [13] 0.01927711 0.86265074 0.58152620 -0.70361457 -1.66747015 0.26024101
## [19] -1.30602431 -0.82409652 1.14377529 0.98313269 -0.62329328 -1.22570301
## [25] -0.26184743 0.13975906 0.70200815 0.50120490 -0.10120484 -1.70763080
## [31] 1.78634568 -0.26184743 2.14779152 -0.62329328 0.34056231 -2.31004054
## [37] -1.78795210 -1.06506042 0.05943776 0.94297204 0.98313269 0.09959841
## [43] 0.50120490 1.94698827 0.42088360 0.05943776 1.10361464 0.54136555
## [49] -1.26586366 -0.10120484
# scale() does the centering and division in one call; it returns a one-column
# matrix with the center and scale recorded as attributes.
scale(foodData[["Density"]])
## [,1]
## [1,] 0.78232945
## [2,] -1.58714886
## [3,] 0.09959841
## [4,] 0.50120490
## [5,] 0.94297204
## [6,] -0.54297198
## [7,] -0.86425717
## [8,] -0.70361457
## [9,] 0.78232945
## [10,] 0.70200815
## [11,] -0.22168678
## [12,] -0.18152613
## [13,] 0.01927711
## [14,] 0.86265074
## [15,] 0.58152620
## [16,] -0.70361457
## [17,] -1.66747015
## [18,] 0.26024101
## [19,] -1.30602431
## [20,] -0.82409652
## [21,] 1.14377529
## [22,] 0.98313269
## [23,] -0.62329328
## [24,] -1.22570301
## [25,] -0.26184743
## [26,] 0.13975906
## [27,] 0.70200815
## [28,] 0.50120490
## [29,] -0.10120484
## [30,] -1.70763080
## [31,] 1.78634568
## [32,] -0.26184743
## [33,] 2.14779152
## [34,] -0.62329328
## [35,] 0.34056231
## [36,] -2.31004054
## [37,] -1.78795210
## [38,] -1.06506042
## [39,] 0.05943776
## [40,] 0.94297204
## [41,] 0.98313269
## [42,] 0.09959841
## [43,] 0.50120490
## [44,] 1.94698827
## [45,] 0.42088360
## [46,] 0.05943776
## [47,] 1.10361464
## [48,] 0.54136555
## [49,] -1.26586366
## [50,] -0.10120484
## attr(,"scaled:center")
## [1] 2857.6
## attr(,"scaled:scale")
## [1] 124.5
# sapply() demo: same operation as with lapply(), but the result is simplified
# to a character vector named after the input elements.
proglangs <- c("R", "Python", "Julia", "Java", "Erlang")
str(proglangs)
## chr [1:5] "R" "Python" "Julia" "Java" "Erlang"
proglangs_modified <- sapply(X = proglangs, FUN = toupper)
str(proglangs_modified)
## Named chr [1:5] "R" "PYTHON" "JULIA" "JAVA" "ERLANG"
## - attr(*, "names")= chr [1:5] "R" "Python" "Julia" "Java" ...
str(foodData)
## 'data.frame': 50 obs. of 5 variables:
## $ OilPercentage: num 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
## $ Density : int 2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
## $ Crispy : Factor w/ 2 levels "high","low": 2 1 1 2 2 1 1 2 2 2 ...
## $ Fracture : int 23 9 17 31 26 16 17 26 23 24 ...
## $ Hardness : int 97 139 143 95 143 189 114 63 123 132 ...
Standardize the continuous columns of the data frame (center each to mean 0 and scale to unit standard deviation).
# Standardize every numeric column of foodData in place.
# `ind` is a named logical mask over the columns. vapply() guarantees a
# logical vector, whereas sapply() returns an empty list for zero columns.
ind <- vapply(foodData, is.numeric, logical(1))
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes. as.numeric() strips the dimensions and attributes,
# so each column stays a plain numeric vector instead of a matrix embedded in
# the data frame.
foodData[ind] <- lapply(foodData[ind], function(x) as.numeric(scale(x)))
head(foodData)
Calculate the correlation matrix for continuous features.
# Pairwise correlation matrix of the columns selected by `ind`
foodData[ind] %>% cor()
## OilPercentage Density Fracture Hardness
## OilPercentage 1.00000000 -0.7500240 -0.5337392 -0.09604521
## Density -0.75002399 1.0000000 0.5721324 0.10793720
## Fracture -0.53373917 0.5721324 1.0000000 -0.37335844
## Hardness -0.09604521 0.1079372 -0.3733584 1.00000000