This famous (Fisher’s or Anderson’s) iris data set gives the measurements in centimeters of the variables sepal length and width and petal length and width, respectively, for 50 flowers from each of 3 species of iris. The species are Iris setosa, versicolor, and virginica. Type answers after the -> sign.
library(datasets)
#Display the structure of the iris data frame: number of observations, number of variables, and each variable's type
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
-> continuous numerical
-> factor nominal
#View just the petal length values
#Finish the following code by typing the variable name Petal.Length after the dollar sign
#(the $ operator pulls a single column out of the data frame as a vector)
iris$Petal.Length
## [1] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 1.5 1.6 1.4 1.1 1.2 1.5 1.3 1.4
## [19] 1.7 1.5 1.7 1.5 1.0 1.7 1.9 1.6 1.6 1.5 1.4 1.6 1.6 1.5 1.5 1.4 1.5 1.2
## [37] 1.3 1.4 1.3 1.5 1.3 1.3 1.3 1.6 1.9 1.4 1.6 1.4 1.5 1.4 4.7 4.5 4.9 4.0
## [55] 4.6 4.5 4.7 3.3 4.6 3.9 3.5 4.2 4.0 4.7 3.6 4.4 4.5 4.1 4.5 3.9 4.8 4.0
## [73] 4.9 4.7 4.3 4.4 4.8 5.0 4.5 3.5 3.8 3.7 3.9 5.1 4.5 4.5 4.7 4.4 4.1 4.0
## [91] 4.4 4.6 4.0 3.3 4.2 4.2 4.2 4.3 3.0 4.1 6.0 5.1 5.9 5.6 5.8 6.6 4.5 6.3
## [109] 5.8 6.1 5.1 5.3 5.5 5.0 5.1 5.3 5.5 6.7 6.9 5.0 5.7 4.9 6.7 4.9 5.7 6.0
## [127] 4.8 4.9 5.6 5.8 6.1 6.4 5.6 5.1 5.6 6.1 5.6 5.5 4.8 5.4 5.6 5.1 5.1 5.9
## [145] 5.7 5.2 5.0 5.2 5.4 5.1
-> we should definitely summarize or sort the values; the raw list is very hard to read and analyze
#Five-number summary plus the mean, for the petal lengths only
with(iris, summary(Petal.Length))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.600 4.350 3.758 5.100 6.900
-> 3.758
-> Min. = 1.000, 1st Qu. = 1.600, Median = 4.350, 3rd Qu. = 5.100, Max. = 6.900
-> 3.5 cm
-> no upper or lower outliers
#Get summary for all numeric (but only numeric) variables
#vapply() always returns one TRUE/FALSE per column, whereas the return type of sapply() depends on its input
summary(iris[vapply(iris, is.numeric, logical(1))])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
-> sepal
-> 5.843
-> petal length
-> 3.5 cm
-> petal.length
-> 5.9 cm
-> 6.4 cm
#Compute each statistic on its own with the matching base R function
with(iris, mean(Petal.Length)) #Mean
## [1] 3.758
with(iris, var(Petal.Length)) #Variance
## [1] 3.116278
with(iris, sd(Petal.Length)) #Standard deviation
## [1] 1.765298
with(iris, median(Petal.Length)) #Median
## [1] 4.35
with(iris, IQR(Petal.Length)) #Interquartile range
## [1] 3.5
-> 1.77
-> On average, the petal lengths differ from the mean by about 1.77 cm.
#Method A: compute one z-score directly from mean() and sd()
#z = (x - mean) / sd for an iris flower whose petal length is 1.1 cm
#(na.rm = TRUE only matters when values are missing; this data has none, so it could be omitted)
z_score1.1 <- (1.1 - mean(iris$Petal.Length, na.rm = TRUE)) /
  sd(iris$Petal.Length, na.rm = TRUE)
print(z_score1.1) #Display z-score
## [1] -1.505695
-> the petal length of 1.1cm is 1.51 standard deviations below the mean.
#Method B: store the mean and standard deviation first, then reuse them
#z-score for an iris flower whose petal length is 6.9 cm
m <- with(iris, mean(Petal.Length)) #Mean of all petal lengths, kept as m
s <- with(iris, sd(Petal.Length)) #Standard deviation of all petal lengths, kept as s
z_score6.9 <- (6.9 - m) / s #z = (x - mean) / sd
print(z_score6.9) #Display z-score
## [1] 1.779869
-> the petal length of 6.9cm is 1.78 standard deviations above the mean.
with(iris, quantile(Petal.Length, probs = c(0.25, 0.50, 0.75))) #Quartiles: Q1, median, and Q3
## 25% 50% 75%
## 1.60 4.35 5.10
-> yes, they are the same.
with(iris, quantile(Petal.Length, probs = c(0.35, 0.67, 0.85, 0.99))) #35th, 67th, 85th, and 99th percentiles
## 35% 67% 85% 99%
## 3.33 4.90 5.60 6.70
-> 85% of the iris petal lengths are 5.6 cm or less
#Get a histogram for all of the iris petal lengths (every species together)
hist(iris$Petal.Length, #Run histogram function on Petal Lengths
     main = "Histogram of Iris Petal Lengths", #Add title
     xlab = "Petal Lengths", #Add x-axis label
     border = "thistle4", #Color of bin outlines
     col = "hotpink1", #Bin colors
     las = 1) #Draw the axis tick labels horizontally
#Note: the call must not end with a trailing comma; an empty trailing argument
#is passed on as an extra (missing) positional argument
-> it is probably because all three species may not have the same length distributions.
#List the species category names so the petal data can be separated by species
levels(iris[["Species"]])
## [1] "setosa" "versicolor" "virginica"
#Keep only the petal length and petal width columns for the setosa rows
setosa <- iris[iris$Species == "setosa", c("Petal.Length", "Petal.Width")]
str(setosa)
## 'data.frame': 50 obs. of 2 variables:
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#Keep only the petal length and petal width columns for the versicolor rows
versicolor <- iris[iris$Species == "versicolor", c("Petal.Length", "Petal.Width")]
str(versicolor)
## 'data.frame': 50 obs. of 2 variables:
## $ Petal.Length: num 4.7 4.5 4.9 4 4.6 4.5 4.7 3.3 4.6 3.9 ...
## $ Petal.Width : num 1.4 1.5 1.5 1.3 1.5 1.3 1.6 1 1.3 1.4 ...
#Keep only the petal length and petal width columns for the virginica rows
virginica <- iris[iris$Species == "virginica", c("Petal.Length", "Petal.Width")]
str(virginica)
## 'data.frame': 50 obs. of 2 variables:
## $ Petal.Length: num 6 5.1 5.9 5.6 5.8 6.6 4.5 6.3 5.8 6.1 ...
## $ Petal.Width : num 2.5 1.9 2.1 1.8 2.2 2.1 1.7 1.8 1.8 2.5 ...
#Get histograms for the petal lengths of the 3 flower species
#xlab is set explicitly: when it is omitted, hist() labels the x-axis with the
#raw expression that was passed in (for example "setosa$Petal.Length")
hist(setosa$Petal.Length, #Run histogram function on setosa petal lengths
     main = "Histogram of Setosa Petal Lengths", #Add title
     xlab = "Petal Lengths") #Same x-axis label as the all-iris histogram
hist(versicolor$Petal.Length, #Run histogram function on versicolor petal lengths
     main = "Histogram of Versicolor Petal Lengths", #Add title
     xlab = "Petal Lengths") #Same x-axis label as the all-iris histogram
hist(virginica$Petal.Length, #Run histogram function on virginica petal lengths
     main = "Histogram of Virginica Petal Lengths", #Add title
     xlab = "Petal Lengths") #Same x-axis label as the all-iris histogram
-> it is easier to see their distributions and ranges
-> they have the shortest petal lengths of the 3 species
-> their petal lengths are similar to virginica's, but slightly shorter
-> they have the longest petal lengths of the 3 species
library(graphics)
# Basic stem-and-leaf plot of all the petal lengths; for this data R splits each stem into two rows
with(iris, stem(Petal.Length))
##
## The decimal point is at the |
##
## 1 | 012233333334444444444444
## 1 | 55555555555556666666777799
## 2 |
## 2 |
## 3 | 033
## 3 | 55678999
## 4 | 000001112222334444
## 4 | 5555555566677777888899999
## 5 | 000011111111223344
## 5 | 55566666677788899
## 6 | 0011134
## 6 | 6779
-> it is displaying similar information
# Create boxplot for all of the petal lengths
# The value axis of a box plot shows the measurements themselves (here in cm), not counts,
# so it is labelled with the variable and its unit rather than "Frequency"
boxplot(iris$Petal.Length, #Select data and variable
        main = "Box Plot of Petal Length", #Add title
        xlab = "Petal Lengths", #Add x-axis label
        ylab = "Petal Length (cm)") #Add y-axis label (measured values)
-> no, there are no outliers
-> yes.