Webinar-2

Execute the following cell to load the tidyverse library:

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.1

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.4.0     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.0     v forcats 0.5.1

## Warning: package 'ggplot2' was built under R version 4.1.3

## Warning: package 'tidyr' was built under R version 4.1.1

## Warning: package 'readr' was built under R version 4.1.2

## Warning: package 'purrr' was built under R version 4.1.1

## Warning: package 'dplyr' was built under R version 4.1.1

## Warning: package 'stringr' was built under R version 4.1.1

## Warning: package 'forcats' was built under R version 4.1.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Execute the following cell to load the data. For a description of the data set, see http://openmv.net/info/food-texture:

# Read the food-texture CSV straight from its URL; the first column of the
# file supplies the row names of the resulting data frame.
file <- "http://openmv.net/file/food-texture.csv"
foodData <- read.csv(file = file, row.names = 1, header = TRUE)

Print the structure of the data frame

# Compact overview of foodData: one line per column with its type and first values.
str(foodData)

## 'data.frame':    50 obs. of  5 variables:
##  $ Oil     : num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy  : int  10 14 12 10 11 13 13 10 11 11 ...
##  $ Fracture: int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness: int  97 139 143 95 143 189 114 63 123 132 ...

Print first 5 samples of data frame

# Show the first five rows of foodData.
head(foodData, n = 5L)

Modify the data frame

# Give the Oil column the more descriptive name OilPercentage, then confirm
# the new column name in the structure listing.
foodData <- rename(foodData, OilPercentage = Oil)
str(foodData)

## 'data.frame':    50 obs. of  5 variables:
##  $ OilPercentage: num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density      : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy       : int  10 14 12 10 11 13 13 10 11 11 ...
##  $ Fracture     : int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness     : int  97 139 143 95 143 189 114 63 123 132 ...

# Recode the Crispy score as a text label: scores above 11 become "high",
# scores of 11 or less become "low". The column holds strings (chr) afterwards,
# not 0/1 codes.
foodData <- foodData %>%
  mutate(Crispy = ifelse(Crispy > 11, "high", "low"))
head(foodData, 5)

str(foodData)

## 'data.frame':    50 obs. of  5 variables:
##  $ OilPercentage: num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density      : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy       : chr  "low" "high" "high" "low" ...
##  $ Fracture     : int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness     : int  97 139 143 95 143 189 114 63 123 132 ...

# Preview only: apply factor() to the one-column selection and inspect the
# resulting list without modifying foodData.
str(lapply(foodData["Crispy"], FUN = factor))

## List of 1
##  $ Crispy: Factor w/ 2 levels "high","low": 2 1 1 2 2 1 1 2 2 2 ...

# Small character vector used to illustrate how lapply() behaves.
proglangs <- c("R", "Python", "Julia", "Java", "Erlang")
str(proglangs)

##  chr [1:5] "R" "Python" "Julia" "Java" "Erlang"

# lapply() calls toupper() on each element and always returns a list,
# here one upper-cased string per list element.
proglangs_modified <- lapply(X = proglangs, FUN = toupper)
str(proglangs_modified)

## List of 5
##  $ : chr "R"
##  $ : chr "PYTHON"
##  $ : chr "JULIA"
##  $ : chr "JAVA"
##  $ : chr "ERLANG"

# Convert the Crispy column from character to factor. Selecting with
# single brackets keeps a one-column data frame, so lapply() returns a
# one-element list that is assigned back into the same column.
foodData["Crispy"] <- lapply(foodData["Crispy"], FUN = factor)

# Plain character vector, used below to contrast with its factor version.
marks <- c("high", "low", "low", "high", "high")
str(marks)

##  chr [1:5] "high" "low" "low" "high" "high"

# The first element of a character vector prints as a bare string.
marks[1L]

## [1] "high"

# factor() turns the strings into a categorical variable with the
# levels "high" and "low", stored internally as integer codes.
marks_cat <- factor(x = marks)
str(marks_cat)

##  Factor w/ 2 levels "high","low": 1 2 2 1 1

# The first element of a factor prints together with the full set of levels.
marks_cat[1L]

## [1] high
## Levels: high low

Print structure of modified data frame

# Compact overview of foodData: one line per column with its type and first values.
str(foodData)

## 'data.frame':    50 obs. of  5 variables:
##  $ OilPercentage: num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density      : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy       : Factor w/ 2 levels "high","low": 2 1 1 2 2 1 1 2 2 2 ...
##  $ Fracture     : int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness     : int  97 139 143 95 143 189 114 63 123 132 ...

Make a scatter plot between Density (x-axis) and OilPercentage (y-axis). What do you observe from this plot?

# Scatter plot of OilPercentage (y) against Density (x); the plot object is
# stored in p1 and then drawn by printing it.
p1 <- ggplot(foodData, aes(Density, OilPercentage)) +
  geom_point(size = 1)
p1

Make a scatter plot between Density (x-axis) and OilPercentage (y-axis) color coded using Crispy. What do you observe from this plot?

# Same scatter plot as before, with each point coloured by its Crispy level.
p2 <- ggplot(foodData, aes(Density, OilPercentage, color = Crispy)) +
  geom_point(size = 1)
p2

Calculate Pearson’s correlation between Density and OilPercentage.

# Pearson correlation coefficient between Density and OilPercentage.
with(foodData, cor(Density, OilPercentage, method = "pearson"))

## [1] -0.750024

# Compact overview of foodData: one line per column with its type and first values.
str(foodData)

## 'data.frame':    50 obs. of  5 variables:
##  $ OilPercentage: num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density      : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy       : Factor w/ 2 levels "high","low": 2 1 1 2 2 1 1 2 2 2 ...
##  $ Fracture     : int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness     : int  97 139 143 95 143 189 114 63 123 132 ...

# Centre Density: subtract the column mean from every observation.
with(foodData, Density - mean(Density))

##  [1]   97.4 -197.6   12.4   62.4  117.4  -67.6 -107.6  -87.6   97.4   87.4
## [11]  -27.6  -22.6    2.4  107.4   72.4  -87.6 -207.6   32.4 -162.6 -102.6
## [21]  142.4  122.4  -77.6 -152.6  -32.6   17.4   87.4   62.4  -12.6 -212.6
## [31]  222.4  -32.6  267.4  -77.6   42.4 -287.6 -222.6 -132.6    7.4  117.4
## [41]  122.4   12.4   62.4  242.4   52.4    7.4  137.4   67.4 -157.6  -12.6

# Sample standard deviation of Density.
sd(foodData[["Density"]])

## [1] 124.5

# Standardise Density by hand: centre on the mean, then divide by the
# standard deviation (z-scores).
with(foodData, (Density - mean(Density)) / sd(Density))

##  [1]  0.78232945 -1.58714886  0.09959841  0.50120490  0.94297204 -0.54297198
##  [7] -0.86425717 -0.70361457  0.78232945  0.70200815 -0.22168678 -0.18152613
## [13]  0.01927711  0.86265074  0.58152620 -0.70361457 -1.66747015  0.26024101
## [19] -1.30602431 -0.82409652  1.14377529  0.98313269 -0.62329328 -1.22570301
## [25] -0.26184743  0.13975906  0.70200815  0.50120490 -0.10120484 -1.70763080
## [31]  1.78634568 -0.26184743  2.14779152 -0.62329328  0.34056231 -2.31004054
## [37] -1.78795210 -1.06506042  0.05943776  0.94297204  0.98313269  0.09959841
## [43]  0.50120490  1.94698827  0.42088360  0.05943776  1.10361464  0.54136555
## [49] -1.26586366 -0.10120484

# scale() performs the same standardisation in one call; it returns a
# one-column matrix and records the centre and scale it used as attributes.
scale(foodData[["Density"]])

##              [,1]
##  [1,]  0.78232945
##  [2,] -1.58714886
##  [3,]  0.09959841
##  [4,]  0.50120490
##  [5,]  0.94297204
##  [6,] -0.54297198
##  [7,] -0.86425717
##  [8,] -0.70361457
##  [9,]  0.78232945
## [10,]  0.70200815
## [11,] -0.22168678
## [12,] -0.18152613
## [13,]  0.01927711
## [14,]  0.86265074
## [15,]  0.58152620
## [16,] -0.70361457
## [17,] -1.66747015
## [18,]  0.26024101
## [19,] -1.30602431
## [20,] -0.82409652
## [21,]  1.14377529
## [22,]  0.98313269
## [23,] -0.62329328
## [24,] -1.22570301
## [25,] -0.26184743
## [26,]  0.13975906
## [27,]  0.70200815
## [28,]  0.50120490
## [29,] -0.10120484
## [30,] -1.70763080
## [31,]  1.78634568
## [32,] -0.26184743
## [33,]  2.14779152
## [34,] -0.62329328
## [35,]  0.34056231
## [36,] -2.31004054
## [37,] -1.78795210
## [38,] -1.06506042
## [39,]  0.05943776
## [40,]  0.94297204
## [41,]  0.98313269
## [42,]  0.09959841
## [43,]  0.50120490
## [44,]  1.94698827
## [45,]  0.42088360
## [46,]  0.05943776
## [47,]  1.10361464
## [48,]  0.54136555
## [49,] -1.26586366
## [50,] -0.10120484
## attr(,"scaled:center")
## [1] 2857.6
## attr(,"scaled:scale")
## [1] 124.5

# Small character vector used to illustrate how sapply() behaves.
proglangs <- c("R", "Python", "Julia", "Java", "Erlang")
str(proglangs)

##  chr [1:5] "R" "Python" "Julia" "Java" "Erlang"

# sapply() simplifies the per-element results into a character vector and
# names each entry after the original string it was computed from.
proglangs_modified <- sapply(X = proglangs, FUN = toupper)
str(proglangs_modified)

##  Named chr [1:5] "R" "PYTHON" "JULIA" "JAVA" "ERLANG"
##  - attr(*, "names")= chr [1:5] "R" "Python" "Julia" "Java" ...

# Compact overview of foodData: one line per column with its type and first values.
str(foodData)

## 'data.frame':    50 obs. of  5 variables:
##  $ OilPercentage: num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density      : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy       : Factor w/ 2 levels "high","low": 2 1 1 2 2 1 1 2 2 2 ...
##  $ Fracture     : int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness     : int  97 139 143 95 143 189 114 63 123 132 ...

Scale continuous columns of dataframe.

# Flag the numeric columns. vapply() always yields a logical vector, whereas
# sapply() would return an empty list for a data frame without columns.
ind <- vapply(foodData, is.numeric, logical(1))
# Standardise each flagged column to mean 0 and standard deviation 1.
# scale() returns a one-column matrix with "scaled:center"/"scaled:scale"
# attributes; as.numeric() strips them so every column remains a plain
# numeric vector instead of a matrix embedded in the data frame.
foodData[ind] <- lapply(foodData[ind], function(col) as.numeric(scale(col)))
head(foodData)

Calculate the correlation matrix for continuous features.

# Pairwise Pearson correlations between the columns selected by `ind`.
cor(foodData[ind], method = "pearson")

##               OilPercentage    Density   Fracture    Hardness
## OilPercentage    1.00000000 -0.7500240 -0.5337392 -0.09604521
## Density         -0.75002399  1.0000000  0.5721324  0.10793720
## Fracture        -0.53373917  0.5721324  1.0000000 -0.37335844
## Hardness        -0.09604521  0.1079372 -0.3733584  1.00000000