library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.1
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Example atomic vectors, one per basic type, plus three ways to build sequences.
myvector1 <- c(1, 2, 3)            # numeric
myvector2 <- c("a", "b", "c")      # character
myvector3 <- c(TRUE, FALSE, TRUE)  # logical
myvector4 <- c(1.5, 2.5, 3.5)      # numeric with fractional parts
myvector5 <- seq(1, 10)            # 1 to 10 via seq()
myvector6 <- 1:10                  # 1 to 10 via the colon operator
myvector7 <- seq(1, 10, by = 0.5)  # 1 to 10 in steps of 0.5

# Print every vector; the recorded console output follows each call.
print(myvector1)
## [1] 1 2 3
print(myvector2)
## [1] "a" "b" "c"
print(myvector3)
## [1] TRUE FALSE TRUE
print(myvector4)
## [1] 1.5 2.5 3.5
print(myvector5)
## [1] 1 2 3 4 5 6 7 8 9 10
print(myvector6)
## [1] 1 2 3 4 5 6 7 8 9 10
print(myvector7)
## [1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0 7.5 8.0
## [16] 8.5 9.0 9.5 10.0
# Two 2 x 3 matrices built from the same six values.
# mymatrix1 uses the default fill order (column by column);
# mymatrix2 sets byrow = TRUE so the values are laid out row by row.
mymatrix1 <- matrix(
  c(1, 2, 3, 4, 5, 6),
  nrow = 2,
  ncol = 3
)
mymatrix2 <- matrix(
  c(1, 2, 3, 4, 5, 6),
  nrow = 2,
  ncol = 3,
  byrow = TRUE
)
print(mymatrix1)
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
print(mymatrix2)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
# A list can mix element types: a number, a single string, and a character vector.
mylist1 <- list(
  1,
  "Name",
  c("Math", "Physics", "Chemistry")
)
print(mylist1)
## [[1]]
## [1] 1
##
## [[2]]
## [1] "Name"
##
## [[3]]
## [1] "Math" "Physics" "Chemistry"
# Indexing examples (R indices start at 1); recorded output follows each call.
# First element of myvector1.
myvector1[1]
## [1] 1
# Tenth element of myvector7.
myvector7[10]
## [1] 5.5
# Matrix element at row 1, column 3.
mymatrix2[1, 3]
## [1] 3
# Leaving the column index empty selects the whole first row.
mymatrix2[1, ]
## [1] 1 2 3
# Leaving the row index empty selects the whole second column.
mymatrix2[, 2]
## [1] 2 5
# Load the food-texture data set from a local CSV file.
# Alternative remote source, currently disabled:
#file = 'http://openmv.net/file/food-texture.csv'
file <- "food-texture.csv"
# The first CSV column supplies the row names (row.names = 1).
foodData <- read.csv(
  file,
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE
)

# Quick look at the data: first five rows, structure, dimensions, and names.
head(foodData, 5)
str(foodData)
## 'data.frame': 50 obs. of 5 variables:
## $ Oil : num 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
## $ Density : int 2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
## $ Crispy : int 10 14 12 10 11 13 13 10 11 11 ...
## $ Fracture: int 23 9 17 31 26 16 17 26 23 24 ...
## $ Hardness: int 97 139 143 95 143 189 114 63 123 132 ...
nrow(foodData)
## [1] 50
ncol(foodData)
## [1] 5
colnames(foodData)
## [1] "Oil" "Density" "Crispy" "Fracture" "Hardness"
rownames(foodData)
## [1] "B110" "B136" "B171" "B192" "B225" "B237" "B261" "B264" "B353" "B360"
## [11] "B366" "B377" "B391" "B397" "B404" "B437" "B445" "B462" "B485" "B488"
## [21] "B502" "B554" "B556" "B575" "B576" "B605" "B612" "B615" "B649" "B665"
## [31] "B674" "B692" "B694" "B719" "B727" "B758" "B776" "B799" "B836" "B848"
## [41] "B861" "B869" "B876" "B882" "B889" "B907" "B911" "B923" "B971" "B998"
# Inspect types: typeof() reports the storage type, str() gives a compact summary.
typeof(myvector1)
## [1] "double"
typeof(myvector2)
## [1] "character"
str(myvector1)
## num [1:3] 1 2 3
str(myvector2)
## chr [1:3] "a" "b" "c"
str(foodData)
## 'data.frame': 50 obs. of 5 variables:
## $ Oil : num 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
## $ Density : int 2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
## $ Crispy : int 10 14 12 10 11 13 13 10 11 11 ...
## $ Fracture: int 23 9 17 31 26 16 17 26 23 24 ...
## $ Hardness: int 97 139 143 95 143 189 114 63 123 132 ...

# Three ways to extract the Oil column as a plain vector: `$`, `[[`, and `[, name]`.
foodData$Oil
## [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3
foodData[["Oil"]]
## [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3
foodData[, "Oil"]
## [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3
# Single-bracket indexing by name returns a one-column data frame, not a vector.
foodData["Oil"]

# Single cells: first by row position, then by row name.
foodData[1, "Oil"]
## [1] 16.5
foodData["B110", "Crispy"]
## [1] 10
#### Rename the Oil column to OilPercentage
# dplyr::rename() takes new_name = old_name; the result is assigned back to foodData.
foodData <- rename(foodData, OilPercentage = Oil)
str(foodData)
## 'data.frame': 50 obs. of 5 variables:
## $ OilPercentage: num 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
## $ Density : int 2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
## $ Crispy : int 10 14 12 10 11 13 13 10 11 11 ...
## $ Fracture : int 23 9 17 31 26 16 17 26 23 24 ...
## $ Hardness : int 97 139 143 95 143 189 114 63 123 132 ...

# The same access patterns now use the new column name.
foodData$OilPercentage
## [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3
foodData[["OilPercentage"]]
## [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3
foodData[, "OilPercentage"]
## [1] 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 18.0 17.4 18.4 13.9 15.8
## [16] 16.4 18.9 17.3 16.7 19.1 13.7 14.7 18.1 17.2 18.7 18.1 16.6 17.1 17.4 19.4
## [31] 15.9 17.1 15.5 17.7 15.9 21.2 19.5 20.5 17.0 16.7 16.8 16.8 16.3 16.2 18.1
## [46] 16.6 16.4 15.1 21.1 16.3
foodData["OilPercentage"]
#### Accessing multiple columns of a data frame using base R and dplyr
# The two numbered headings below are written as full comments. Starting a line
# with `1.` or `2.` before the `#` makes R evaluate and print a stray number.

#### 1. Accessing multiple columns using base R
# Index with a character vector of column names; the result is a data frame.
foodData[c("OilPercentage", "Density")]
# Dropping columns by negating their names does not work in base R: unary minus
# is not defined for character vectors, so the line below stays disabled.
#foodData[-c('OilPercentage','Density')]

#### 2. Accessing multiple columns using dplyr
# Keep only the named columns.
foodData %>% select(c(OilPercentage, Density))
# Keep every column except the named ones.
foodData %>% select(-c(OilPercentage, Density))
# Row filtering with logical masks (base R), then the same with dplyr::filter().

# Mask: which samples have OilPercentage of at least 16.5?
foodData$OilPercentage >= 16.5
## [1] TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## [13] TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## [25] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [37] TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE
## [49] TRUE FALSE
# Crispy values of the samples selected by that mask.
foodData[foodData$OilPercentage >= 16.5, "Crispy"]
## [1] 10 14 10 13 13 10 12 12 14 14 12 13 14 13 8 13 12 10 10 13 12 10 13 14 13
## [26] 14 11 10 10 12 12 11 13
# Mask: which samples have Density of at most 2900?
foodData$Density <= 2900
## [1] FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## [13] TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## [25] TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE
## [37] TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## [49] TRUE TRUE
# Combine both conditions element-wise with `&`.
foodData$OilPercentage >= 16.5 & foodData$Density <= 2900
## [1] FALSE TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## [13] TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## [25] TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [37] TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## [49] TRUE FALSE
# Crispy values of the samples that satisfy both conditions.
foodData[foodData$OilPercentage >= 16.5 & foodData$Density <= 2900, "Crispy"]
## [1] 14 13 13 10 12 12 14 14 12 13 14 13 8 13 12 13 12 10 13 14 13 14 11 12 11
## [26] 13

# The same row filter written with dplyr.
foodData %>%
  filter(OilPercentage >= 16.5 & Density <= 2900)
# Goal: from the samples matching the filter above, keep only the Fracture and
# Hardness columns.
# Step 1: the filter alone returns all columns of the matching samples.
foodData %>%
  filter(OilPercentage >= 16.5 & Density <= 2900)
# Step 2: pipe the filtered rows into select() to keep Fracture and Hardness.
foodData %>%
  filter(OilPercentage >= 16.5 & Density <= 2900) %>%
  select(c(Fracture, Hardness))
# Exam practice: fill in the blanks (?) in the pipeline below.
#foodData %>% ?(?>= 16.5 & Density <=2900) %>% select (c(Fracture, ?))
# Exam practice: explain what the following code does.
#foodData %>% filter(OilPercentage>= 16.5 & Density <=2900) %>% select (c(Fracture, Hardness))
# Filter samples whose Crispy index is 9 or 15 (both are very rare).
# The raw mask is a long TRUE/FALSE vector, which is hard to read on its own.
foodData$Crispy == 9 | foodData$Crispy == 15
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE
# Summing the mask counts the TRUE values, i.e. the number of matching samples.
sum(foodData$Crispy == 9 | foodData$Crispy == 15)
## [1] 2
# Density values of the samples that satisfy the condition.
foodData[foodData$Crispy == 9 | foodData$Crispy == 15, "Density"]
## [1] 2930 2770
# Leaving the column index empty returns all columns of the matching samples.
foodData[foodData$Crispy == 9 | foodData$Crispy == 15, ]

#### The same selection using dplyr
# Keep samples whose Crispy value is 9 or 15.
foodData %>%
  filter(Crispy %in% c(9, 15))
# Keep samples whose Crispy value is neither 9 nor 15.
foodData %>%
  filter(!(Crispy %in% c(9, 15)))
# Preview Density multiplied by 1e-9 (i.e. 10^(-9)).
foodData %>%
  mutate(Density = Density * 1e-9)
# The call above only prints a modified copy; foodData itself is unchanged.
# To keep the change, the result would have to be assigned back:
#foodData=foodData %>% mutate(Density=Density*1e-9)

# Preview a new Densitylevel column: "High" when Density > 2850, otherwise "Low".
foodData %>%
  mutate(Densitylevel = ifelse(Density > 2850, "High", "Low"))
# Store the new column by assigning the result back to foodData.
foodData <- foodData %>%
  mutate(Densitylevel = ifelse(Density > 2850, "High", "Low"))
str(foodData)
## 'data.frame': 50 obs. of 6 variables:
## $ OilPercentage: num 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
## $ Density : int 2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
## $ Crispy : int 10 14 12 10 11 13 13 10 11 11 ...
## $ Fracture : int 23 9 17 31 26 16 17 26 23 24 ...
## $ Hardness : int 97 139 143 95 143 189 114 63 123 132 ...
## $ Densitylevel : chr "High" "Low" "High" "High" ...
#### Convert the Crispy and Densitylevel columns to factor (categorical) type
categorical_cols <- c("Crispy", "Densitylevel")
continuous_cols <- c("OilPercentage", "Density", "Fracture", "Hardness")
# Apply factor() to each categorical column and write the results back in place.
foodData[categorical_cols] <- lapply(foodData[categorical_cols], factor)
str(foodData)
## 'data.frame': 50 obs. of 6 variables:
## $ OilPercentage: num 16.5 17.7 16.2 16.7 16.3 19.1 18.4 17.5 15.7 16.4 ...
## $ Density : int 2955 2660 2870 2920 2975 2790 2750 2770 2955 2945 ...
## $ Crispy : Factor w/ 9 levels "7","8","9","10",..: 4 8 6 4 5 7 7 4 5 5 ...
## $ Fracture : int 23 9 17 31 26 16 17 26 23 24 ...
## $ Hardness : int 97 139 143 95 143 189 114 63 123 132 ...
## $ Densitylevel : Factor w/ 2 levels "High","Low": 1 2 1 1 1 2 2 2 1 1 ...

# The High/Low categorical column, now printed together with its levels.
foodData$Densitylevel
## [1] High Low High High High Low Low Low High High Low Low High High High
## [16] Low Low High Low Low High High Low Low Low High High High Low Low
## [31] High Low High Low High Low Low Low High High High High High High High
## [46] High High High Low Low
## Levels: High Low
# Contrast matrix for the factor (dummy encoding): High -> 0, Low -> 1.
contrasts(foodData$Densitylevel)
## Low
## High 0
## Low 1
# List the level labels of Densitylevel.
levels(foodData$Densitylevel)
## [1] "High" "Low"
# List the level labels of Crispy.
levels(foodData$Crispy)
## [1] "7" "8" "9" "10" "11" "12" "13" "14" "15"
foodData$Crispy
## [1] 10 14 12 10 11 13 13 10 11 11 12 12 14 12 9 15 14 12 13 14 10 10 13 8 13
## [26] 12 10 10 13 12 10 10 7 13 12 14 13 14 11 10 10 12 11 8 12 11 12 10 13 10
## [45] Levels: 7 8 9 10 11 12 13 14 15
categorical_cols
## [1] "Crispy" "Densitylevel"
# All column names except the categorical ones.
setdiff(colnames(foodData), categorical_cols)
## [1] "OilPercentage" "Density" "Fracture" "Hardness"