1. Choose and load any R dataset (except for diamonds!) that has at least two numeric variables and at least two categorical variables. Identify which variables in your data set are numeric, and which are categorical (factors).
# Bring the built-in ChickWeight data frame into the workspace, then print
# each column's type so numeric and factor variables can be told apart.
data("ChickWeight", package = "datasets")
str(object = ChickWeight)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame': 578 obs. of 4 variables:
## $ weight: num 42 51 59 64 76 93 106 125 149 171 ...
## $ Time : num 0 2 4 6 8 10 12 14 16 18 ...
## $ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
## $ Diet : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "formula")=Class 'formula' length 3 weight ~ Time | Chick
## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
## - attr(*, "outer")=Class 'formula' length 2 ~Diet
## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
## - attr(*, "labels")=List of 2
## ..$ x: chr "Time"
## ..$ y: chr "Body weight"
## - attr(*, "units")=List of 2
## ..$ x: chr "(days)"
## ..$ y: chr "(gm)"
#weight - a numeric vector giving the body weight of the chick (gm).
#Time - a numeric vector giving the number of days since birth when the measurement was made.
#Chick - an ordered factor with levels 18 < ... < 48 giving a unique identifier for the chick.
#Diet - a factor with levels 1, ..., 4 indicating which experimental diet the chick received.
2. Generate summary-level descriptive statistics: show the mean, median, 25th and 75th percentiles (1st and 3rd quartiles), min, and max for each of the applicable variables in your data set.
# Min, quartiles, median, mean and max for the numeric columns (weight, Time);
# per-level counts for the factor columns (Chick, Diet).
summary(object = ChickWeight)
## weight Time Chick Diet
## Min. : 35.0 Min. : 0.00 13 : 12 1:220
## 1st Qu.: 63.0 1st Qu.: 4.00 9 : 12 2:120
## Median :103.0 Median :10.00 20 : 12 3:120
## Mean :121.8 Mean :10.72 10 : 12 4:118
## 3rd Qu.:163.8 3rd Qu.:16.00 17 : 12
## Max. :373.0 Max. :21.00 19 : 12
## (Other):506
3. Determine the frequency of each level of one of the categorical variables.
# One-way frequency table: number of observations recorded under each diet.
table(ChickWeight[["Diet"]])
##
## 1 2 3 4
## 220 120 120 118
# One-way frequency table: number of measurements recorded for each chick.
table(ChickWeight[["Chick"]])
##
## 18 16 15 13 9 20 10 8 17 19 4 6 11 3 1 12 2 5 14 7 24 30 22 23 27
## 2 7 8 12 12 12 12 11 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12
## 28 26 25 29 21 33 37 36 31 39 38 32 40 34 35 44 45 43 41 47 49 46 50 42 48
## 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 10 12 12 12 12 12 12 12 12 12
4. Determine the frequency of each level of one of the categorical variables, broken down by a different categorical variable.
# Two-way frequency table with diets as rows and chicks as columns.
table(ChickWeight[["Diet"]], ChickWeight[["Chick"]])
##
## 18 16 15 13 9 20 10 8 17 19 4 6 11 3 1 12 2 5 14 7 24 30 22
## 1 2 7 8 12 12 12 12 11 12 12 12 12 12 12 12 12 12 12 12 12 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 12 12
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## 23 27 28 26 25 29 21 33 37 36 31 39 38 32 40 34 35 44 45 43 41 47 49
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 12 12 12 12 12 12 12 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 12 12 12 12 12 12 12 12 12 12 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10 12 12 12 12 12
##
## 46 50 42 48
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 12 12 12 12
# Two-way frequency table with chicks as rows and diets as columns
# (the transpose of the diet-by-chick layout, easier to read on a narrow page).
table(ChickWeight[["Chick"]], ChickWeight[["Diet"]])
##
## 1 2 3 4
## 18 2 0 0 0
## 16 7 0 0 0
## 15 8 0 0 0
## 13 12 0 0 0
## 9 12 0 0 0
## 20 12 0 0 0
## 10 12 0 0 0
## 8 11 0 0 0
## 17 12 0 0 0
## 19 12 0 0 0
## 4 12 0 0 0
## 6 12 0 0 0
## 11 12 0 0 0
## 3 12 0 0 0
## 1 12 0 0 0
## 12 12 0 0 0
## 2 12 0 0 0
## 5 12 0 0 0
## 14 12 0 0 0
## 7 12 0 0 0
## 24 0 12 0 0
## 30 0 12 0 0
## 22 0 12 0 0
## 23 0 12 0 0
## 27 0 12 0 0
## 28 0 12 0 0
## 26 0 12 0 0
## 25 0 12 0 0
## 29 0 12 0 0
## 21 0 12 0 0
## 33 0 0 12 0
## 37 0 0 12 0
## 36 0 0 12 0
## 31 0 0 12 0
## 39 0 0 12 0
## 38 0 0 12 0
## 32 0 0 12 0
## 40 0 0 12 0
## 34 0 0 12 0
## 35 0 0 12 0
## 44 0 0 0 10
## 45 0 0 0 12
## 43 0 0 0 12
## 41 0 0 0 12
## 47 0 0 0 12
## 49 0 0 0 12
## 46 0 0 0 12
## 50 0 0 0 12
## 42 0 0 0 12
## 48 0 0 0 12
5. Create a graph for a single numeric variable.
# Box-and-whisker plot summarising the spread of all recorded weights.
with(ChickWeight, boxplot(
  weight,
  main = "Distribution of Chicken Weight",
  xlab = "ChickenWeight"
))
# Frequency histogram of the recorded weights, using hist()'s default breaks.
hist(
  ChickWeight[["weight"]],
  main = "Distribution of Chicken Weight",
  xlab = "ChickenWeight"
)
# Histogram drawn on the density scale (freq = FALSE) so that a kernel density
# estimate of the same weights can be overlaid with lines().
hist(
  ChickWeight[["weight"]],
  main = "Distribution of Chicken Weight",
  xlab = "ChickenWeight",
  freq = FALSE
)
lines(density(ChickWeight[["weight"]]))
# Density-scale histogram overlaid with a normal density curve whose mean and
# standard deviation are computed from the weights themselves.
hist(
  ChickWeight[["weight"]],
  freq = FALSE,
  col = "lightgreen",
  main = "Distribution of Chicken Weight",
  xlab = "ChickenWeight"
)
curve(
  dnorm(x, mean = mean(ChickWeight[["weight"]]), sd = sd(ChickWeight[["weight"]])),
  add = TRUE,
  lwd = 2,
  col = "darkblue"
)
# Histogram of chick weight using ggplot2.
# ggplot() + geom_histogram() replaces qplot(), which is deprecated as of
# ggplot2 3.4.0. Functions are namespace-qualified because no library(ggplot2)
# call is visible in this script. The column is mapped by name through aes()
# instead of being passed as ChickWeight$weight next to data =, so the x axis
# is labelled "weight" rather than "ChickWeight$weight".
# No binwidth is supplied, so ggplot2's default binning still applies.
ggplot2::ggplot(ChickWeight, ggplot2::aes(x = weight)) +
  ggplot2::geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
6. Create a scatterplot of two numeric variables.
# Scatterplot of chick weight against time, using base R.
# Time is the explanatory variable: the data set's own metadata printed by
# str() gives the formula weight ~ Time and labels x as "Time" and y as
# "Body weight". It therefore goes on the horizontal axis and weight on the
# vertical axis.
plot(
  ChickWeight$Time, ChickWeight$weight,
  xlab = "Time", ylab = "Chicken Weight"
)
# Same plot in ggplot2.
# ggplot() + geom_point() replaces qplot(), which is deprecated as of
# ggplot2 3.4.0. Columns are mapped by name through aes() instead of
# ChickWeight$..., and functions are namespace-qualified because no
# library(ggplot2) call is visible in this script.
ggplot2::ggplot(ChickWeight, ggplot2::aes(x = Time, y = weight)) +
  ggplot2::geom_point() +
  ggplot2::labs(x = "Time", y = "Chicken Weight")