Execute the following cells to load the libraries

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.1
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Creating homogeneous data structures: vectors and matrices (built on the fundamental data types character, numeric, integer, and logical)

# Atomic vectors: all elements of one vector share a single type.
myvector1 <- c(1, 2, 3)
myvector2 <- c("a", "b", "c")
myvector3 <- c(TRUE, FALSE, TRUE)
myvector4 <- c(1.5, 2.5, 3.5)
# Sequences: seq(1, 10) and 1:10 give the same values; `by` sets the step size.
myvector5 <- seq(1, 10)
myvector6 <- 1:10
myvector7 <- seq(1, 10, by = 0.5)
print(myvector1)
## [1] 1 2 3
print(myvector2)
## [1] "a" "b" "c"
print(myvector3)
## [1]  TRUE FALSE  TRUE
print(myvector4)
## [1] 1.5 2.5 3.5
print(myvector5)
##  [1]  1  2  3  4  5  6  7  8  9 10
print(myvector6)
##  [1]  1  2  3  4  5  6  7  8  9 10
print(myvector7)
##  [1]  1.0  1.5  2.0  2.5  3.0  3.5  4.0  4.5  5.0  5.5  6.0  6.5  7.0  7.5  8.0
## [16]  8.5  9.0  9.5 10.0
# Matrices are filled column by column unless byrow = TRUE is given.
mymatrix1 <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2, ncol = 3)
mymatrix2 <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2, ncol = 3, byrow = TRUE)
print(mymatrix1)
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
print(mymatrix2)
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6

Creating a list, a heterogeneous data structure

# A list may hold elements of different types and lengths:
# here a number, a string, and a character vector.
mylist1 <- list(1, "Name", c("Math", "Physics", "Chemistry"))
print(mylist1)
## [[1]]
## [1] 1
## 
## [[2]]
## [1] "Name"
## 
## [[3]]
## [1] "Math"      "Physics"   "Chemistry"

Accessing elements of a datastructure

# Indexing starts at 1: the first element of myvector1.
myvector1[1]
## [1] 1
# Tenth element of the 0.5-step sequence.
myvector7[10]
## [1] 5.5
# Matrix indexing is [row, column]: row 1, column 3.
mymatrix2[1, 3]
## [1] 3
# Leaving the column index empty returns the whole first row.
mymatrix2[1, ]
## [1] 1 2 3
# Leaving the row index empty returns the whole second column.
mymatrix2[, 2]
## [1] 2 5

Loading data into a dataframe, a heterogeneous data structure

# Remote copy of the data set, kept for reference:
# file <- "http://openmv.net/file/food-texture.csv"
file <- "food-texture.csv"
# The first line of the CSV is the header; the first column becomes the row names.
foodData <- read.csv(
  file = file,
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE
)
# Show the first five rows.
head(foodData, n = 5)

Attributes of a dataframe

# Compact summary: class, dimensions, and the type and first values of each column.
str(foodData)
## 'data.frame':    50 obs. of  5 variables:
##  $ Oil     : num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy  : int  10 14 12 10 11 13 13 10 11 11 ...
##  $ Fracture: int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness: int  97 139 143 95 143 189 114 63 123 132 ...
# Number of rows (samples).
nrow(foodData)
## [1] 50
# Number of columns (variables).
ncol(foodData)
## [1] 5
# Column names.
colnames(foodData)
## [1] "Oil"      "Density"  "Crispy"   "Fracture" "Hardness"
# Row names (the sample identifiers).
rownames(foodData)
##  [1] "B110" "B136" "B171" "B192" "B225" "B237" "B261" "B264" "B353" "B360"
## [11] "B366" "B377" "B391" "B397" "B404" "B437" "B445" "B462" "B485" "B488"
## [21] "B502" "B554" "B556" "B575" "B576" "B605" "B612" "B615" "B649" "B665"
## [31] "B674" "B692" "B694" "B719" "B727" "B758" "B776" "B799" "B836" "B848"
## [41] "B861" "B869" "B876" "B882" "B889" "B907" "B911" "B923" "B971" "B998"

Get the data type and the data structure associated with an object

# typeof() reports the storage type of an object.
typeof(myvector1)
## [1] "double"
typeof(myvector2)
## [1] "character"
# str() reports the structure: type, length, and a preview of the values.
str(myvector1)
##  num [1:3] 1 2 3
str(myvector2)
##  chr [1:3] "a" "b" "c"
# For a data frame, str() lists every column with its type.
str(foodData)
## 'data.frame':    50 obs. of  5 variables:
##  $ Oil     : num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy  : int  10 14 12 10 11 13 13 10 11 11 ...
##  $ Fracture: int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness: int  97 139 143 95 143 189 114 63 123 132 ...

Accessing elements of a data frame

# Three equivalent ways to extract the Oil column as a plain vector.
foodData$Oil
##  [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3
foodData[["Oil"]]
##  [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3
foodData[, "Oil"]
##  [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3
# Single brackets with only a name keep the data frame container (one column).
foodData["Oil"]
# A single cell, addressed by row position and column name ...
foodData[1, "Oil"]
## [1] 16.5
# ... or by row name and column name.
foodData["B110", "Crispy"]
## [1] 10

#### Rename the Oil column to OilPercentage

# rename() takes new_name = old_name; assign the result back to keep the change.
foodData <- foodData %>%
  rename(OilPercentage = Oil)
str(foodData)
## 'data.frame':    50 obs. of  5 variables:
##  $ OilPercentage: num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density      : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy       : int  10 14 12 10 11 13 13 10 11 11 ...
##  $ Fracture     : int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness     : int  97 139 143 95 143 189 114 63 123 132 ...

Accessing elements of a particular column of a dataframe

# The renamed column, extracted as a vector in three equivalent ways.
foodData$OilPercentage
##  [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3
foodData[["OilPercentage"]]
##  [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3
foodData[, "OilPercentage"]
##  [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3
# Single brackets with only a name return a one-column data frame instead.
foodData["OilPercentage"]

#### Accessing multiple columns of a dataframe using base R and dplyr

#### 1. Accessing multiple columns of a dataframe using base R

# Keep only the listed columns: indexing a data frame with a character vector
# selects columns by name and returns a data frame.
foodData[c("OilPercentage", "Density")]
# Columns cannot be dropped by putting a minus in front of their names, because
# unary minus is not defined for character vectors:
#   foodData[-c("OilPercentage", "Density")]  # Error: invalid argument to unary operator
# Select the complement of the unwanted names instead.
foodData[setdiff(colnames(foodData), c("OilPercentage", "Density"))]

#### 2. Accessing multiple columns of a dataframe using dplyr

# Keep only the named columns.
foodData %>%
  select(OilPercentage, Density)
# A leading minus drops the named columns and keeps everything else.
foodData %>%
  select(-c(OilPercentage, Density))

Accessing rows of a dataframe satisfying certain conditions

# A comparison on a column yields one TRUE/FALSE per row.
foodData$OilPercentage >= 16.5
##  [1]  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [13]  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
## [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE
## [49]  TRUE FALSE
# Use the logical vector as a row index to pull Crispy for the matching rows.
foodData[foodData$OilPercentage >= 16.5, "Crispy"]
##  [1] 10 14 10 13 13 10 12 12 14 14 12 13 14 13  8 13 12 10 10 13 12 10 13 14 13
## [26] 14 11 10 10 12 12 11 13
foodData$Density <= 2900
##  [1] FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [13]  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [25]  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE
## [37]  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## [49]  TRUE  TRUE
# Elementwise `&` combines the two conditions row by row.
foodData$OilPercentage >= 16.5 & foodData$Density <= 2900
##  [1] FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [13]  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [25]  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
## [37]  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## [49]  TRUE FALSE
foodData[foodData$OilPercentage >= 16.5 & foodData$Density <= 2900, "Crispy"]
##  [1] 14 13 13 10 12 12 14 14 12 13 14 13  8 13 12 13 12 10 13 14 13 14 11 12 11
## [26] 13
# The dplyr equivalent: filter() keeps the rows where the condition is TRUE.
foodData %>%
  filter(OilPercentage >= 16.5 & Density <= 2900)

Accessing rows of a dataframe satisfying certain conditions and selecting particular columns

# Goal: from the samples with OilPercentage >= 16.5 and Density <= 2900,
# keep only the Fracture and Hardness columns.
foodData %>%
  filter(OilPercentage >= 16.5 & Density <= 2900) # all columns of the matching samples
foodData %>%
  filter(OilPercentage >= 16.5 & Density <= 2900) %>%
  select(Fracture, Hardness)
# The second pipeline returns Fracture and Hardness for the samples with
# OilPercentage >= 16.5 and Density <= 2900.
# for exam
# foodData %>% ?(?>= 16.5 & Density <=2900) %>% select (c(Fracture, ?))
# explain the code
# foodData %>% filter(OilPercentage>= 16.5 & Density <=2900) %>% select (c(Fracture, Hardness))

#### Accessing rows of a dataframe satisfying certain conditions

# Filter samples with a Crispy index of 9 or 15 (which are very rare).
# The bare condition gives only a vector of TRUE/FALSE values, which is hard to read.
foodData$Crispy == 9 | foodData$Crispy == 15
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE
# Summing the logical vector counts the TRUE values.
sum(foodData$Crispy == 9 | foodData$Crispy == 15)
## [1] 2
# Density values of the rows satisfying the condition.
foodData[foodData$Crispy == 9 | foodData$Crispy == 15, "Density"]
## [1] 2930 2770
# All columns of the rows satisfying the condition.
foodData[foodData$Crispy == 9 | foodData$Crispy == 15, ]
#### how to access the data using dplyr
foodData %>%
  filter(Crispy %in% c(9, 15))
# Negating the membership test keeps the rows whose Crispy value is neither 9 nor 15.
foodData %>%
  filter(!(Crispy %in% c(9, 15)))

Modifying a column of a dataframe

# Rescale Density by 1e-9. The result is only printed: mutate() returns a
# modified copy and leaves foodData unchanged unless it is assigned back.
foodData %>%
  mutate(Density = Density * 1e-9)
# To change the data frame itself, overwrite it:
# foodData <- foodData %>% mutate(Density = Density * 1e-9)
# Preview a new column Densitylevel: "High" when Density > 2850, otherwise "Low".
foodData %>%
  mutate(Densitylevel = ifelse(Density > 2850, "High", "Low"))
# Assign the result back so the new column is stored in foodData.
foodData <- foodData %>%
  mutate(Densitylevel = ifelse(Density > 2850, "High", "Low"))
str(foodData)
## 'data.frame':    50 obs. of  6 variables:
##  $ OilPercentage: num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density      : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy       : int  10 14 12 10 11 13 13 10 11 11 ...
##  $ Fracture     : int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness     : int  97 139 143 95 143 189 114 63 123 132 ...
##  $ Densitylevel : chr  "High" "Low" "High" "High" ...

#### Change the Crispy and Densitylevel columns to factor (categorical) type

# Column groups by role.
categorical_cols <- c("Crispy", "Densitylevel")
continuous_cols <- c("OilPercentage", "Density", "Fracture", "Hardness")
# Convert every categorical column to a factor in one step.
foodData[categorical_cols] <- lapply(foodData[categorical_cols], factor)
str(foodData)
## 'data.frame':    50 obs. of  6 variables:
##  $ OilPercentage: num  16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
##  $ Density      : int  2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
##  $ Crispy       : Factor w/ 9 levels "7","8","9","10",..: 4 8 6 4 5 7 7 4 5 5 ...
##  $ Fracture     : int  23 9 17 31 26 16 17 26 23 24 ...
##  $ Hardness     : int  97 139 143 95 143 189 114 63 123 132 ...
##  $ Densitylevel : Factor w/ 2 levels "High","Low": 1 2 1 1 1 2 2 2 1 1 ...
# The High/Low categorical column, now a factor.
foodData$Densitylevel
##  [1] High Low  High High High Low  Low  Low  High High Low  Low  High High High
## [16] Low  Low  High Low  Low  High High Low  Low  Low  High High High Low  Low 
## [31] High Low  High Low  High Low  Low  Low  High High High High High High High
## [46] High High High Low  Low 
## Levels: High Low
# Dummy (0/1) encoding that the factor would receive in a model.
contrasts(foodData$Densitylevel)
##      Low
## High   0
## Low    1
# Level labels of each factor (the label encoding).
levels(foodData$Densitylevel)
## [1] "High" "Low"
levels(foodData$Crispy)
## [1] "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
foodData$Crispy
##  [1] 10 14 12 10 11 13 13 10 11 11 12 12 14 12 9  15 14 12 13 14 10 10 13 8  13
## [26] 12 10 10 13 12 10 10 7  13 12 14 13 14 11 10 10 12 11 8  12 11 12 10 13 10
## [27] placeholder
categorical_cols
## [1] "Crispy"       "Densitylevel"
# All column names except the categorical ones.
setdiff(colnames(foodData), categorical_cols)
## [1] "OilPercentage" "Density"       "Fracture"      "Hardness"