Webinar-1 May 2024

Execute the following cells to load the libraries

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.1

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Creating homogeneous data structures: vectors and matrices (built on the fundamental data types character, numeric, integer, and logical)

# Atomic vectors: every element of a vector shares one data type.
myvector1 <- c(1, 2, 3)            # numeric (stored as double)
myvector2 <- c("a", "b", "c")      # character
myvector3 <- c(TRUE, FALSE, TRUE)  # logical
myvector4 <- c(1.5, 2.5, 3.5)      # numeric with fractional values
# Three ways of building a sequence from 1 to 10.
myvector5 <- seq(from = 1, to = 10)
myvector6 <- 1:10
myvector7 <- seq(from = 1, to = 10, by = 0.5)  # step of 0.5
print(myvector1)

## [1] 1 2 3

print(myvector2)

## [1] "a" "b" "c"

print(myvector3)

## [1]  TRUE FALSE  TRUE

print(myvector4)

## [1] 1.5 2.5 3.5

print(myvector5)

##  [1]  1  2  3  4  5  6  7  8  9 10

print(myvector6)

##  [1]  1  2  3  4  5  6  7  8  9 10

print(myvector7)

##  [1]  1.0  1.5  2.0  2.5  3.0  3.5  4.0  4.5  5.0  5.5  6.0  6.5  7.0  7.5  8.0
## [16]  8.5  9.0  9.5 10.0

# Same six values, filled column by column (the default) ...
mymatrix1 <- matrix(data = c(1, 2, 3, 4, 5, 6), nrow = 2, ncol = 3)
# ... and filled row by row.
mymatrix2 <- matrix(
  data = c(1, 2, 3, 4, 5, 6),
  nrow = 2,
  ncol = 3,
  byrow = TRUE
)
print(mymatrix1)

##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6

print(mymatrix2)

##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6

Creating a list, a heterogeneous data structure

# A list may mix types: a number, a string, and a character vector.
mylist1 <- list(
  1,
  "Name",
  c("Math", "Physics", "Chemistry")
)
print(mylist1)

## [[1]]
## [1] 1
## 
## [[2]]
## [1] "Name"
## 
## [[3]]
## [1] "Math"      "Physics"   "Chemistry"

Accessing elements of a datastructure

myvector1[1]

## [1] 1

myvector7[10]

## [1] 5.5

mymatrix2[1, 3]

## [1] 3

mymatrix2[1, ]

## [1] 1 2 3

mymatrix2[, 2]

## [1] 2 5

Loading data into a data frame, a heterogeneous data structure

# Alternative source (URL), left commented out; the local copy is read instead.
#file = 'http://openmv.net/file/food-texture.csv'
file <- "food-texture.csv"
# The first CSV column supplies the row names; strings stay as character.
foodData <- read.csv(
  file,
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE
)
# Show the first five rows.
head(foodData, n = 5)

Attributes of a dataframe

str(foodData)

## 'data.frame':    50 obs. of  5 variables:
##  $ Oil     : num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy  : int  10 14 12 10 11 13 13 10 11 11 ...
##  $ Fracture: int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness: int  97 139 143 95 143 189 114 63 123 132 ...

nrow(foodData)

## [1] 50

ncol(foodData)

## [1] 5

colnames(foodData)

## [1] "Oil"      "Density"  "Crispy"   "Fracture" "Hardness"

rownames(foodData)

##  [1] "B110" "B136" "B171" "B192" "B225" "B237" "B261" "B264" "B353" "B360"
## [11] "B366" "B377" "B391" "B397" "B404" "B437" "B445" "B462" "B485" "B488"
## [21] "B502" "B554" "B556" "B575" "B576" "B605" "B612" "B615" "B649" "B665"
## [31] "B674" "B692" "B694" "B719" "B727" "B758" "B776" "B799" "B836" "B848"
## [41] "B861" "B869" "B876" "B882" "B889" "B907" "B911" "B923" "B971" "B998"

Get the data type and the data structure associated with an object

typeof(myvector1)

## [1] "double"

typeof(myvector2)

## [1] "character"

str(myvector1)

##  num [1:3] 1 2 3

str(myvector2)

##  chr [1:3] "a" "b" "c"

str(foodData)

## 'data.frame':    50 obs. of  5 variables:
##  $ Oil     : num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy  : int  10 14 12 10 11 13 13 10 11 11 ...
##  $ Fracture: int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness: int  97 139 143 95 143 189 114 63 123 132 ...

Accessing elements of a data frame

foodData$Oil

##  [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3

foodData[['Oil']]

##  [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3

foodData[, 'Oil']

##  [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3

foodData['Oil']

foodData[1, 'Oil']

## [1] 16.5

foodData['B110', 'Crispy']

## [1] 10

#### Rename the Oil column to OilPercentage

# rename() takes new_name = old_name; assign back so the change is kept.
foodData <- rename(foodData, OilPercentage = Oil)
# Confirm the column now appears under its new name.
str(foodData)

## 'data.frame':    50 obs. of  5 variables:
##  $ OilPercentage: num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density      : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy       : int  10 14 12 10 11 13 13 10 11 11 ...
##  $ Fracture     : int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness     : int  97 139 143 95 143 189 114 63 123 132 ...

Accessing elements of a particular column of a dataframe

foodData$OilPercentage

##  [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3

foodData[['OilPercentage']]

##  [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3

foodData[,'OilPercentage']

##  [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3

foodData['OilPercentage']

#### Accessing multiple columns of a dataframe using base R and dplyr

#### 1. Accessing multiple columns of a dataframe using base R

foodData[c('OilPercentage','Density')]

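# Left commented out: unary minus cannot be applied to a character vector, so this line errors.
# To drop columns by name, use setdiff() on the column names or dplyr's select(-c(...)).
#foodData[-c('OilPercentage','Density')]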

#### 2. Accessing multiple columns of a dataframe using dplyr

# Keep only the two named columns.
select(foodData, OilPercentage, Density)

# Keep every column except the two named ones.
select(foodData, -c(OilPercentage, Density))

Accessing rows of a dataframe satisfying certain conditions

foodData$OilPercentage>=16.5

##  [1]  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [13]  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
## [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE
## [49]  TRUE FALSE

foodData[foodData$OilPercentage>=16.5,'Crispy']

##  [1] 10 14 10 13 13 10 12 12 14 14 12 13 14 13  8 13 12 10 10 13 12 10 13 14 13
## [26] 14 11 10 10 12 12 11 13

foodData$Density<=2900

##  [1] FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [13]  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [25]  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE
## [37]  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## [49]  TRUE  TRUE

foodData$OilPercentage>=16.5& foodData$Density<=2900

##  [1] FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [13]  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [25]  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
## [37]  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## [49]  TRUE FALSE

foodData[foodData$OilPercentage>=16.5& foodData$Density<=2900,'Crispy']

##  [1] 14 13 13 10 12 12 14 14 12 13 14 13  8 13 12 13 12 10 13 14 13 14 11 12 11
## [26] 13

# dplyr equivalent of the logical-mask subsetting: rows meeting both conditions, all columns.
filter(foodData, OilPercentage >= 16.5 & Density <= 2900)

Accessing rows of a dataframe satisfying certain conditions and selecting particular columns

# Step 1: keep the samples with OilPercentage >= 16.5 and Density <= 2900
# (every column is returned).
foodData %>%
  filter(OilPercentage >= 16.5, Density <= 2900)

# Step 2: the same samples, restricted to the Fracture and Hardness columns.
foodData %>%
  filter(OilPercentage >= 16.5, Density <= 2900) %>%
  select(Fracture, Hardness)

# Select the Fracture and Hardness columns for the samples with OilPercentage >= 16.5 & Density <= 2900
#for exam 
#foodData %>% ?(?>= 16.5 & Density <=2900) %>% select (c(Fracture, ?))
#explain the code
#foodData %>% filter(OilPercentage>= 16.5 & Density <=2900) %>% select (c(Fracture, Hardness))

#### Accessing rows of a dataframe satisfying certain conditions

#filter samples with a Crispy index of 9 or 15 (which are very rare)
foodData$Crispy==9 | foodData$Crispy==15 # returns only a logical (TRUE/FALSE) vector, one entry per sample, which is hard to read on its own

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE

sum(foodData$Crispy==9 | foodData$Crispy==15) # counts the TRUE values, i.e. the number of samples matching the condition

## [1] 2

foodData[foodData$Crispy==9 | foodData$Crispy==15,'Density'] # gives the Density values of the samples satisfying the condition

## [1] 2930 2770

foodData[foodData$Crispy==9 | foodData$Crispy==15,] # gives all columns for the samples satisfying the condition Crispy==9 | Crispy==15

#### the same selections written with dplyr
# Keep the samples whose Crispy value is 9 or 15.
filter(foodData, Crispy %in% c(9, 15))

# Negate the membership test to keep every sample whose Crispy value is neither 9 nor 15.
filter(foodData, !(Crispy %in% c(9, 15)))

Modifying a column of a dataframe

# Rescale Density by 1e-9. mutate() returns a modified copy, so this only
# displays the result; foodData itself is left unchanged.
mutate(foodData, Density = Density * 1e-9)

# To keep the rescaled values, the result would have to be assigned back to
# foodData. Left commented out: the Densitylevel threshold used below (2850)
# is expressed in the original Density units.
#foodData=foodData %>% mutate(Density=Density*1e-9)
# Preview a new column, Densitylevel, set to 'High' when Density > 2850 and
# to 'Low' otherwise (again displayed only, not stored).
mutate(foodData, Densitylevel = ifelse(Density > 2850, "High", "Low"))

# Assign the result back to foodData so the new Densitylevel column is kept.
foodData <- mutate(
  foodData,
  Densitylevel = ifelse(Density > 2850, "High", "Low")
)
# Densitylevel should now be listed as a sixth (character) column.
str(foodData)

## 'data.frame':    50 obs. of  6 variables:
##  $ OilPercentage: num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density      : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy       : int  10 14 12 10 11 13 13 10 11 11 ...
##  $ Fracture     : int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness     : int  97 139 143 95 143 189 114 63 123 132 ...
##  $ Densitylevel : chr  "High" "Low" "High" "High" ...

#### Change the Crispy and Densitylevel columns to factor (categorical) type

# Names of the columns treated as categorical and as continuous.
categorical_cols <- c("Crispy", "Densitylevel")
continuous_cols <- c("OilPercentage", "Density", "Fracture", "Hardness")
# Replace each categorical column with its factor version; other columns are untouched.
foodData[categorical_cols] <- lapply(
  foodData[categorical_cols],
  function(column) factor(column)
)
str(foodData)

## 'data.frame':    50 obs. of  6 variables:
##  $ OilPercentage: num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density      : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy       : Factor w/ 9 levels "7","8","9","10",..: 4 8 6 4 5 7 7 4 5 5 ...
##  $ Fracture     : int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness     : int  97 139 143 95 143 189 114 63 123 132 ...
##  $ Densitylevel : Factor w/ 2 levels "High","Low": 1 2 1 1 1 2 2 2 1 1 ...

foodData$Densitylevel # the High/Low categorical column, now stored as a factor

##  [1] High Low  High High High Low  Low  Low  High High Low  Low  High High High
## [16] Low  Low  High Low  Low  High High Low  Low  Low  High High High Low  Low 
## [31] High Low  High Low  High Low  Low  Low  High High High High High High High
## [46] High High High Low  Low 
## Levels: High Low

contrasts(foodData$Densitylevel) # shows the dummy (0/1) coding assigned to each level of the factor

##      Low
## High   0
## Low    1

levels(foodData$Densitylevel) # lists the level labels of the Densitylevel factor

## [1] "High" "Low"

levels(foodData$Crispy) # lists the level labels of the Crispy factor

## [1] "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"

foodData$Crispy

##  [1] 10 14 12 10 11 13 13 10 11 11 12 12 14 12 9  15 14 12 13 14 10 10 13 8  13
## [26] 12 10 10 13 12 10 10 7  13 12 14 13 14 11 10 10 12 11 8  12 11 12 10 13 10
## Levels: 7 8 9 10 11 12 13 14 15

categorical_cols

## [1] "Crispy"       "Densitylevel"

setdiff(colnames(foodData),categorical_cols) # column names of foodData with the categorical columns removed

## [1] "OilPercentage" "Density"       "Fracture"      "Hardness"