(a) Find and get a dataset from the datasets available within R.

Perform exploratory data analysis (EDA) and prepare a codebook on that dataset using a newer method in R.

Loading iris data set

# Attach the built-in datasets package and load `iris` into the workspace.
library(datasets)
data("iris")

Peek at data

# Show the first six rows of the data set.
head(iris, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

The iris data set contains 150 observations, 50 for each of three species of iris flower: setosa, versicolor and virginica. Each observation records four measurements of the flower: sepal length, sepal width, petal length and petal width.

Quick data visualization

Histogram

Basic Histogram

# One histogram per numeric measurement.
#   col  - fill colour of the histogram bars
#   main - plot title
#   xlab - x-axis label; set explicitly because the default is the raw
#          expression passed to hist() (e.g. "iris$Sepal.Length")
hist(
  iris$Sepal.Length,
  col = "lightgreen",
  main = "Sepal Length of Iris",
  xlab = "Sepal Length"
)

hist(
  iris$Sepal.Width,
  col = "darkgreen",
  main = "Sepal Width of Iris",
  xlab = "Sepal Width"
)

hist(
  iris$Petal.Length,
  col = "pink",
  main = "Petal Length of Iris",
  xlab = "Petal Length"
)

hist(
  iris$Petal.Width,
  col = "magenta",
  main = "Petal Width of Iris",
  xlab = "Petal Width"
)

Histogram By Group

Example: Petal Width by Species

# Put the graphs in 3 rows and 1 column, saving the previous layout so it can
# be restored once the three panels are drawn.
old_par <- par(mfrow = c(3, 1))

# One histogram of petal width per species. The shared x-range and break
# count keep the three panels directly comparable.
panel_title <- c(
  setosa = "Petal Width for Setosa",
  versicolor = "Petal Width for Versicolor",
  virginica = "Petal Width for Virginica"
)
panel_col <- c(
  setosa = "brown1",
  versicolor = "dodgerblue1",
  virginica = "limegreen"
)

for (sp in names(panel_title)) {
  hist(
    iris$Petal.Width[iris$Species == sp],
    xlim = c(0, 3),
    breaks = 9,
    main = panel_title[[sp]],
    xlab = "",
    col = panel_col[[sp]]
  )
}

# Restore the previous layout so later base-graphics plots are not squeezed
# into the 3 x 1 grid.
par(old_par)

# DataExplorer draws the histograms for the data frame in a single call.
library(DataExplorer)
plot_histogram(data = iris)

Boxplot

Basic Boxplot

Example R code: Sepal Length by Species

# Distribution of sepal length within each species, one box per species.
boxplot(
  Sepal.Length ~ Species,
  data = iris,
  main = "Sepal Length by Species",
  xlab = "Species",       # x-axis label
  ylab = "Sepal Length"   # y-axis label
)

Scatter plot

Basic Scatter Plot

To create a scatter plot, pass any two variables of the data set to the plot() function.

# Petal width against petal length for all observations.
#   pch - plotting character: which symbol is drawn for each point
#   cex - character expansion: size of the plotted symbols
# The call ends without a trailing comma so that no empty argument is passed
# on to plot().
plot(
  iris$Petal.Length, iris$Petal.Width,
  pch = 16,
  cex = 0.6,
  col = "blue",
  main = "Iris Flower Data Set",
  xlab = "Petal Length",
  ylab = "Petal Width"
)

Scatter Plot of Multiple Groups

The following example uses the pch and col arguments to give each point a plotting character and colour determined by its species (the factor “Species”).

# Scatter plot in which each point's colour and symbol are chosen by its
# species: the factor's integer codes index the colour and symbol vectors.
plot(
  Petal.Width ~ Petal.Length,
  data = iris,
  pch = c(1, 2, 3)[as.integer(Species)],
  col = c("brown1", "dodgerblue1", "limegreen")[as.integer(Species)]
)

# Legend listing the species in the same order as the colour/symbol vectors.
legend(
  "topleft",
  legend = c("setosa", "versicolor", "virginica"),
  pch = c(1, 2, 3),
  col = c("brown1", "dodgerblue1", "limegreen")
)

Codebook (Using newer method in R)

# Class of the object as a whole.
class(iris)
## [1] "data.frame"
# Class of each column, returned as a named character vector.
vapply(iris, class, character(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##    "numeric"    "numeric"    "numeric"    "numeric"     "factor"
# Per-column summary: quartiles and mean for numeric columns, counts per
# level for the factor.
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

(b) Demonstrate these FIVE (5) functions of dplyr for data manipulation:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read csv file
# The file is located with a full path instead of setwd(), so the session's
# working directory is not changed as a side effect of loading the data.
data_dir <- "C:/Users/kelee/Downloads"
data <- read.csv(file.path(data_dir, "Student - Sheet1.csv"))
data
##             Name Age Gender heightInCM weightInKG
## 1    Lee Keat En  19 female        150         51
## 2       John Doe  22   male        185         70
## 3   Siti Rokhiah  18 female        155         48
## 4         Amelia  20 female        158         50
## 5       Isabella  19 female        160         61
## 6            Mia  17 female        148         42
## 7        William  18   male        180         68
## 8          James  21   male        183         73
## 9          Lucas  16   male        169         63
## 10          Lucy  18 female        999        300
## 11        Elijah  15 female        153         45
## 12         David  22   male        173         77
## 13         Asher  19 female        161         56
## 14         Bella  23 female        158         55
## 15        Sophie  17 female        155         50
## 16          Aziq  18   male        188         78
## 17 Jackson Chong  16   male        181         76
## 18         Tammy  20   male        178         75
## 19        Eunice  22   male        154         50

filter()

Exclude “Lucy” (row 10), because her height (999 cm) and weight (300 kg) are invalid values.

# Keep every row whose Name does not match the pattern "Lucy".
data1 <- data %>%
  filter(!grepl("Lucy", Name))
data1
##             Name Age Gender heightInCM weightInKG
## 1    Lee Keat En  19 female        150         51
## 2       John Doe  22   male        185         70
## 3   Siti Rokhiah  18 female        155         48
## 4         Amelia  20 female        158         50
## 5       Isabella  19 female        160         61
## 6            Mia  17 female        148         42
## 7        William  18   male        180         68
## 8          James  21   male        183         73
## 9          Lucas  16   male        169         63
## 10        Elijah  15 female        153         45
## 11         David  22   male        173         77
## 12         Asher  19 female        161         56
## 13         Bella  23 female        158         55
## 14        Sophie  17 female        155         50
## 15          Aziq  18   male        188         78
## 16 Jackson Chong  16   male        181         76
## 17         Tammy  20   male        178         75
## 18        Eunice  22   male        154         50

arrange()

Arrange the students in ascending order of age

# Sort the filtered rows by Age, youngest first.
data2 <- data1 %>%
  arrange(Age)
data2
##             Name Age Gender heightInCM weightInKG
## 1         Elijah  15 female        153         45
## 2          Lucas  16   male        169         63
## 3  Jackson Chong  16   male        181         76
## 4            Mia  17 female        148         42
## 5         Sophie  17 female        155         50
## 6   Siti Rokhiah  18 female        155         48
## 7        William  18   male        180         68
## 8           Aziq  18   male        188         78
## 9    Lee Keat En  19 female        150         51
## 10      Isabella  19 female        160         61
## 11         Asher  19 female        161         56
## 12        Amelia  20 female        158         50
## 13         Tammy  20   male        178         75
## 14         James  21   male        183         73
## 15      John Doe  22   male        185         70
## 16         David  22   male        173         77
## 17        Eunice  22   male        154         50
## 18         Bella  23 female        158         55

mutate()

This function adds a new variable, “BMI”, to the dataset.

# Keep only the two measurement columns needed for the BMI calculation.
heightWeight <- data2 %>%
  select(heightInCM, weightInKG)

# BMI = weight (kg) / height (m)^2; height is stored in cm, hence the / 100.
data3 <- mutate(
  heightWeight,
  BMI = weightInKG / ((heightInCM / 100)^2)
)
data3
##    heightInCM weightInKG      BMI
## 1         153         45 19.22338
## 2         169         63 22.05805
## 3         181         76 23.19832
## 4         148         42 19.17458
## 5         155         50 20.81165
## 6         155         48 19.97919
## 7         180         68 20.98765
## 8         188         78 22.06881
## 9         150         51 22.66667
## 10        160         61 23.82812
## 11        161         56 21.60410
## 12        158         50 20.02884
## 13        178         75 23.67125
## 14        183         73 21.79820
## 15        185         70 20.45289
## 16        173         77 25.72756
## 17        154         50 21.08281
## 18        158         55 22.03173

select()

This function selects and shows only the columns ‘Name’, ‘Age’ and ‘Gender’.

# Keep only the identifying columns of the sorted table.
data4 <- data2 %>%
  select(Name, Age, Gender)
data4
##             Name Age Gender
## 1         Elijah  15 female
## 2          Lucas  16   male
## 3  Jackson Chong  16   male
## 4            Mia  17 female
## 5         Sophie  17 female
## 6   Siti Rokhiah  18 female
## 7        William  18   male
## 8           Aziq  18   male
## 9    Lee Keat En  19 female
## 10      Isabella  19 female
## 11         Asher  19 female
## 12        Amelia  20 female
## 13         Tammy  20   male
## 14         James  21   male
## 15      John Doe  22   male
## 16         David  22   male
## 17        Eunice  22   male
## 18         Bella  23 female
# Combine the identifying columns (data4) with the measurement and BMI
# columns (data3). cbind() pairs rows purely by position; both inputs were
# derived from data2, so row i of each refers to the same student.
dataFinal <- cbind(data4, data3)
dataFinal
##             Name Age Gender heightInCM weightInKG      BMI
## 1         Elijah  15 female        153         45 19.22338
## 2          Lucas  16   male        169         63 22.05805
## 3  Jackson Chong  16   male        181         76 23.19832
## 4            Mia  17 female        148         42 19.17458
## 5         Sophie  17 female        155         50 20.81165
## 6   Siti Rokhiah  18 female        155         48 19.97919
## 7        William  18   male        180         68 20.98765
## 8           Aziq  18   male        188         78 22.06881
## 9    Lee Keat En  19 female        150         51 22.66667
## 10      Isabella  19 female        160         61 23.82812
## 11         Asher  19 female        161         56 21.60410
## 12        Amelia  20 female        158         50 20.02884
## 13         Tammy  20   male        178         75 23.67125
## 14         James  21   male        183         73 21.79820
## 15      John Doe  22   male        185         70 20.45289
## 16         David  22   male        173         77 25.72756
## 17        Eunice  22   male        154         50 21.08281
## 18         Bella  23 female        158         55 22.03173

summary()

# Per-column summary of the combined table built above (dataFinal).
summary(dataFinal)
##      Name                Age           Gender            heightInCM   
##  Length:18          Min.   :15.00   Length:18          Min.   :148.0  
##  Class :character   1st Qu.:17.25   Class :character   1st Qu.:155.0  
##  Mode  :character   Median :19.00   Mode  :character   Median :160.5  
##                     Mean   :19.00                      Mean   :166.1  
##                     3rd Qu.:20.75                      3rd Qu.:179.5  
##                     Max.   :23.00                      Max.   :188.0  
##    weightInKG         BMI       
##  Min.   :42.00   Min.   :19.17  
##  1st Qu.:50.00   1st Qu.:20.54  
##  Median :58.50   Median :21.70  
##  Mean   :60.44   Mean   :21.69  
##  3rd Qu.:72.25   3rd Qu.:22.52  
##  Max.   :78.00   Max.   :25.73