A student evaluation of a teacher is on a 1-5 Leichert scale. Suppose the answers to the first 3 questions are given in this table (not shown)
Enter in the data for question 1 and 2 using c(), scan(), read.table or data.entry()
## Ques.1 Ques.2 Ques.3
## 1 3 5 1
## 2 3 2 3
## 3 3 5 1
## 4 4 5 1
## 5 3 2 1
## 6 4 2 3
## 7 3 5 1
## 8 4 5 1
## 9 3 4 1
## 10 4 2 1
## Question Rating
## 1 Ques.1 3
## 2 Ques.1 3
## 3 Ques.1 3
## 4 Ques.1 4
## 5 Ques.1 3
## 6 Ques.1 4
## 7 Ques.1 3
## 8 Ques.1 4
## 9 Ques.1 3
## 10 Ques.1 4
## 11 Ques.2 5
## 12 Ques.2 2
## 13 Ques.2 5
## 14 Ques.2 5
## 15 Ques.2 2
## 16 Ques.2 2
## 17 Ques.2 5
## 18 Ques.2 5
## 19 Ques.2 4
## 20 Ques.2 2
## 21 Ques.3 1
## 22 Ques.3 3
## 23 Ques.3 1
## 24 Ques.3 1
## 25 Ques.3 1
## 26 Ques.3 3
## 27 Ques.3 1
## 28 Ques.3 1
## 29 Ques.3 1
## 30 Ques.3 1
# Read in Question 1 column and return as table; edit column names
Q1.table <- table(teacher_eval$Ques.1)
names(Q1.table) <- paste(names(Q1.table), c("Rating"), sep = " ")
# Output Question 1 table
Q1.table
## 3 Rating 4 Rating
## 6 4
# Read in Question 2 column and return as table; edit column names
Q2.table <- table(teacher_eval$Ques.2)
names(Q2.table) <- paste(names(Q2.table), c("Rating"), sep = " ")
# Output Question 1 table
Q2.table
## 2 Rating 4 Rating 5 Rating
## 4 1 5
# Read in Question 1 and Question 2 columns into a table
Q1Q2.table <- table(Ques.1 = teacher_eval$Ques.1, Ques.2 = teacher_eval$Ques.2)
# Output frequency table across rows
prop.table(Q1Q2.table, 1)
## Ques.2
## Ques.1 2 4 5
## 3 0.3333333 0.1666667 0.5000000
## 4 0.5000000 0.0000000 0.5000000
# Output frequency table across columns
prop.table(Q1Q2.table, 2)
## Ques.2
## Ques.1 2 4 5
## 3 0.5 1.0 0.6
## 4 0.5 0.0 0.4
# Use alternate data.frame for this question; remove first column
Q2Q3.table <- t(table(teacher_eval_new)[-1,])
# Create barplot on the frequency table (across columns)
barplot(prop.table(Q2Q3.table, 2), col = rainbow(5), legend.text = rownames(Q2Q3.table))
# Similar steps as those in Q3
Q1Q2Q2.table <- t(table(teacher_eval_new))
# Create barplot on the frequency table (across columns)
barplot(prop.table(Q1Q2Q2.table, 2), col = rainbow(5), legend.text = rownames(Q1Q2Q2.table))
In the library MASS is a dataset UScereal which contains information about popular breakfast cereals. Attach the data set as follows:
library("MASS")
data("UScereal")
attach(UScereal)
names(UScereal) # to see the names
## [1] "mfr" "calories" "protein" "fat" "sodium"
## [6] "fibre" "carbo" "sugars" "shelf" "potassium"
## [11] "vitamins"
Now, investigate the following relationships, and make comments on what you see. You can use tables, barplots, scatterplots etc. to do you investigation.
options(digits = 2)
# Percentage of each brand of total cereals
prop.table(table(mfr))
## mfr
## G K N P Q R
## 0.338 0.323 0.046 0.138 0.077 0.077
# Percentage of shelf space
prop.table(table(shelf))
## shelf
## 1 2 3
## 0.28 0.28 0.45
# Column proportions, i.e. what percent of each shelf is represented by each brand
prop.table(table(mfr, shelf), 2)
## shelf
## mfr 1 2 3
## G 0.333 0.389 0.310
## K 0.222 0.389 0.345
## N 0.111 0.000 0.034
## P 0.111 0.056 0.207
## Q 0.000 0.167 0.069
## R 0.222 0.000 0.034
# Row proportions, i.e. where does each brand place their cereals
prop.table(table(mfr, shelf), 1)
## shelf
## mfr 1 2 3
## G 0.27 0.32 0.41
## K 0.19 0.33 0.48
## N 0.67 0.00 0.33
## P 0.22 0.11 0.67
## Q 0.00 0.60 0.40
## R 0.80 0.00 0.20
Although it is known that manufacturers pay for shelf space in stores, if we assume that the physical shelf space is the same for shelves 1, 2, and 3, then the data could be incomplete and shows biased towards the 3rd shelf, given it represents 45% of the inventory.
table(vitamins)
## vitamins
## 100% enriched none
## 5 57 3
prop.table(table(vitamins))
## vitamins
## 100% enriched none
## 0.077 0.877 0.046
Hundred <- fat[vitamins == "100%"]
boxplot(Hundred, main="100% Vitamins",horizontal=TRUE)
No.vitamins <- fat[vitamins == "none"]
boxplot(No.vitamins, main="No Vitamins",horizontal=TRUE)
Enriched <- fat[vitamins == "enriched"]
boxplot(Enriched, main="Enriched with Vitamins",horizontal=TRUE)
hist(Enriched)
There is limited data regarding the relationship between 100%/no vitamins within cereals and fat. However, the historgram for enriched cerreals shows that the distribution skews left.
# Create cuts for the histogram for the grams of fat in each cereal
fat.intervals = cut(fat, breaks=seq(from = -0.5, to = max(fat) + 0.5, by = 1))
# Create a frequency table showing the percentage of total cereals each level of fats is represented on the shelves
fat.by.shelf <- prop.table(table(fat.intervals, shelf))
# Output frequency table
fat.by.shelf
## shelf
## fat.intervals 1 2 3
## (-0.5,0.5] 0.154 0.046 0.138
## (0.5,1.5] 0.092 0.169 0.123
## (1.5,2.5] 0.015 0.015 0.015
## (2.5,3.5] 0.015 0.031 0.092
## (3.5,4.5] 0.000 0.015 0.046
## (4.5,5.5] 0.000 0.000 0.000
## (5.5,6.5] 0.000 0.000 0.015
## (6.5,7.5] 0.000 0.000 0.000
## (7.5,8.5] 0.000 0.000 0.000
## (8.5,9.5] 0.000 0.000 0.015
# Plot frequency table
barplot(fat.by.shelf, beside = TRUE, legend.text=rownames(fat.by.shelf), col = rainbow(nrow(fat.by.shelf)))
# Alternatively, a boxplot of grams of fat in cereal broken down by shelf
boxplot(fat ~ shelf)
There is a far larger percentage of fattier cereals sitting on the 3rd shelf.
# Scatterplot between sugars and carbs
plot(sugars, carbo)
# Draw a linear regression line
abline(lm(carbo ~ sugars))
# R^2 coefficient
cor(carbo, sugars)^2
## [1] 0.0017
There doesn’t appear to be any meaninGful relationship beteween carbohydrates and sugars
# Boxplot showing the distribution of fibre for each manufacturer
boxplot(fibre ~ mfr)
# Summary statstics and number of observations for Nabisco
summary(fibre[which(mfr == "N")])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.5 5.2 6.0 13.6 18.1 30.3
length(fibre[which(mfr == "N")])
## [1] 3
# Summary statstics and number of observations for Post
summary(fibre[which(mfr == "P")])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 7.5 5.4 9.0 12.0
length(fibre[which(mfr == "P")])
## [1] 9
Although, the boxplot for Nabisco appears to give the impression that it is a healthier brand, there are only 3 observations recorded for the manufacturer, with one very sizable outlier. In contrast, with 9 observations, Post provides the healthiest option.
# Scatterplot between sugars and sodium
plot(sugars, sodium)
# Draw a linear regression line
abline(lm(sodium ~ sugars))
# R^2 coefficient
cor(sodium, sugars)
## [1] 0.21
Are there other relationships you can predict and investigate?
cor(UScereal[,unlist(lapply(UScereal, is.numeric))])^2
## calories protein fat sodium fibre carbo sugars shelf
## calories 1.00 0.498 0.348 0.279 0.151 0.6221 0.2453 0.182
## protein 0.50 1.000 0.169 0.328 0.656 0.2993 0.0342 0.157
## fat 0.35 0.169 1.000 0.067 0.051 0.0334 0.1728 0.106
## sodium 0.28 0.328 0.067 1.000 0.246 0.1794 0.0446 0.055
## fibre 0.15 0.656 0.051 0.246 1.000 0.0412 0.0222 0.128
## carbo 0.62 0.299 0.033 0.179 0.041 1.0000 0.0017 0.068
## sugars 0.25 0.034 0.173 0.045 0.022 0.0017 1.0000 0.084
## shelf 0.18 0.157 0.106 0.055 0.128 0.0678 0.0841 1.000
## potassium 0.23 0.709 0.105 0.310 0.929 0.0586 0.0739 0.182
## potassium
## calories 0.227
## protein 0.709
## fat 0.105
## sodium 0.310
## fibre 0.929
## carbo 0.059
## sugars 0.074
## shelf 0.182
## potassium 1.000
The above correlation matrix shows the relationship among each of the variables. Looking at calories, perhaps not a surprise that carbohydrates has a stronger relationship with calories than fat or sugars. It would be interesting to take this matrix and run it for each manufacturer.
detach(UScereal)
The built-in data set mtcars contains information about cars from a 1974 Motor Trend issue. Load the data set (data(mtcars)) and try to answer the following:
data("mtcars")
attach(mtcars)
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
max(mpg)
## [1] 34
row.names(mtcars)[which(mpg == max(mpg))]
## [1] "Toyota Corolla"
row.names(mtcars)[1:5]
## [1] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710"
## [4] "Hornet 4 Drive" "Hornet Sportabout"
mtcars["Valiant",]$hp
## [1] 105
mtcars["Merc 450SLC",]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Merc 450SLC 15 8 276 180 3.1 3.8 18 0 0 3 3
# Linear regression with residual plots
plot(cyl, mpg)
abline(lm(mpg ~ cyl))
# R^2
cor(cyl, mpg)^2
## [1] 0.73
Although the linear regression provides an R2 value of .73, the scarcity in cylinder observations There is interesting behavior for car models with 8 cyclinders, which indicates there are additional factors contributing to mpg observations.
detach(mtcars)
Which variables in mpg are categorical? Which variables are continuous? (Hint: type ?mpg to read the documentation for the dataset). How can you see this information when you run mpg?
data(mtcars)
?mtcars
mtcars[1:5,]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21 6 160 110 3.9 2.6 16 0 1 4 4
## Mazda RX4 Wag 21 6 160 110 3.9 2.9 17 0 1 4 4
## Datsun 710 23 4 108 93 3.8 2.3 19 1 1 4 1
## Hornet 4 Drive 21 6 258 110 3.1 3.2 19 1 0 3 1
## Hornet Sportabout 19 8 360 175 3.1 3.4 17 0 0 3 2
Viewing the sample output, Although a case can be made that the cyl, gear and carb columns can be considered categorical variables since they output discrete values, vs and am are binary and specify types of engines and transmissions, respectively. The remaining variables are continuous.
What plots does the following code make? What does . do?
library(ggplot2)
data(mpg)
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_grid(drv ~ .)
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_grid(. ~ cyl)
The . is similar to an apply function where the corresponding scatterplot of hwy explained by displ, is broken down by each factor within drv and cyl. In the first set of plots, the hwy is broken down by each drv factor and then plotted agains displ. In the second set of plots, hwy is ploted against displ broken down by each cyl factor.
Run this code in your head and predict what the output will look like. Then, run the code in R and check your predictions.
Prediction: a scatter plot of hwy given by displ where each point is colored by the different factors of drv. In addition a smoothed line graph is overlayed.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) +
geom_point() +
geom_smooth(se = FALSE)
I did not anticipate that the geom_smooth would also get broken down by drv.
What is the default geom associated with stat_summary()? How could you rewrite the previous plot to use that geom function instead of the stat function?
The default geom is geom_point graph with error bars.
# Original plot
ggplot(data = diamonds) +
stat_summary(
mapping = aes(x = cut, y = depth),
fun.ymin = min,
fun.ymax = max,
fun.y = median
)
# geom_boxplot version
ggplot(data = diamonds) +
geom_boxplot(
mapping = aes(x = cut, y = depth)
)
What is the problem with this plot? How could you improve it?
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
geom_point(position = "jitter")
Turn a stacked bar chart into a pie chart using coord_polar().
bar <- ggplot(diamonds, aes(x = factor(1), fill = factor(cut))) +
geom_bar(width = 1)
bar + coord_polar(theta = "y")
id=c(1,2,3,4,5)
age=c(31,42,51,55,70)
gender=c(0,0,1,1,1)
mydata1=data.frame(cbind(id,age))
colnames(mydata1)=c("id", "age")
mydata2=data.frame(cbind(id,gender))
colnames(mydata1)=c("id", "gender")
Now, use the merge command to generate a new data.frame that is linked based on ‘id.’
merge(mydata1, mydata2, by = "id")
## id gender.x gender.y
## 1 1 31 0
## 2 2 42 0
## 3 3 51 1
## 4 4 55 1
## 5 5 70 1