4.1
# Survey answers on a 1-5 scale: ten responses each for Q1, Q2 and Q3.
data <- c(3, 3, 3, 4, 3, 4, 3, 4, 3, 4,
          5, 2, 5, 5, 2, 2, 5, 5, 4, 2,
          1, 3, 1, 1, 1, 3, 1, 1, 1, 1)
q <- rep(c("Q1", "Q2", "Q3"), each = 10)
# Build the data frame directly: cbind() would coerce the numeric answers to
# character. Storing the answers as a factor with all five levels keeps the
# zero-count categories visible in the table() summaries.
questions <- data.frame(data = factor(data, levels = 1:5), q = q)
Table 1
# Frequency of each answer among the Q1 responses (rows 1-10).
table(head(questions$data, 10))
##
## 1 2 3 4 5
## 0 0 6 4 0
Table 2
# Frequency of each answer among the Q2 responses (rows 11-20).
with(questions, table(data[11:20]))
##
## 1 2 3 4 5
## 0 4 0 1 5
# Cross-tabulation of the Q1 answers (rows) against the Q2 answers (columns).
with(questions, table(data[1:10], data[11:20]))
##
## 1 2 3 4 5
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 2 0 1 3
## 4 0 2 0 0 2
## 5 0 0 0 0 0
# Bar plot of the Q2-by-Q3 cross-tabulation.
barplot(
  with(questions, table(data[11:20], data[21:30])),
  main = "Barplot Q2 and Q3",
  col = 1:10
)
# Box plots of the raw answers, one box per question.
boxplot(data ~ as.factor(q), main = "Boxplot")
4.2
# Load the UScereal data set from MASS and list its columns.
library(MASS)
data(UScereal)
# attach() puts the columns on the search path; later calls such as
# table(mfr, shelf) refer to them by bare name.
attach(UScereal)
colnames(UScereal)
## [1] "mfr" "calories" "protein" "fat" "sodium"
## [6] "fibre" "carbo" "sugars" "shelf" "potassium"
## [11] "vitamins"
# Number of cereals per manufacturer (rows) and display shelf (columns).
with(UScereal, table(mfr, shelf))
## shelf
## mfr 1 2 3
## G 6 7 9
## K 4 7 10
## N 2 0 1
## P 2 1 6
## Q 0 3 2
## R 4 0 1
Manufacturers G, K and P have cereals on all three shelves, whereas each of the others is missing from at least one shelf. N has the fewest cereals of all the manufacturers, with a total of 3.
library("ggplot2")
# Compare the fat distribution across the vitamin categories. vitamins is a
# categorical column, so it is used as the grouping variable of a formula
# rather than being passed as a second numeric sample.
boxplot(fat ~ vitamins, data = UScereal)
Looking at the box plot, there seems to be no relationship between vitamins and fat.
# Scatter plot of shelf position against fat content.
ggplot(UScereal, aes(x = fat, y = shelf)) +
  geom_point()
From this plot we can determine that the fattier cereals tend to be placed on the higher shelves.
# Scatter plot of sugars against carbohydrates.
ggplot(UScereal, aes(x = carbo, y = sugars)) +
  geom_point()
While there are cereals with sugar content across the spectrum, they all seem to have a similar amount of carbohydrates.
# Fibre content of each cereal, grouped by manufacturer on the y axis.
ggplot(UScereal, aes(x = fibre, y = mfr)) +
  geom_point()
The majority of manufacturers have cereal products with a similar amount of fibre. However, N and K are outliers, as together they have 3 cereals with an outstanding amount of fibre, in the 25+ range.
# Scatter plot of sugars against sodium.
ggplot(UScereal, aes(x = sodium, y = sugars)) +
  geom_point()
For the most part, sodium and sugars share a similar ratio. While there are outliers, it generally looks to be 10-15 g of sugar to 200-400 mg of sodium.
Prediction: Cereals with lower protein will contain higher amounts of fat
# Scatter plot of fat against protein, used to check the prediction.
ggplot(UScereal, aes(x = protein, y = fat)) +
  geom_point()
4.9
# Load the built-in mtcars data set and list its variables.
data(mtcars)
colnames(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
# Largest mpg value in the data set.
max(mtcars[["mpg"]])
## [1] 33.9
# Name of the car with the largest mpg. The row names hold the car models, so
# this identifies the car instead of repeating the maximum value itself.
rownames(mtcars)[which.max(mtcars$mpg)]
## [1] "Toyota Corolla"
# First five cars in the data set.
mtcars[1:5, ]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
# Horsepower of the Valiant, looked up by row name.
mtcars["Valiant", "hp"]
## [1] 105
# Every recorded value for the Merc 450SLC.
mtcars[rownames(mtcars) == "Merc 450SLC", ]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Merc 450SLC 15.2 8 275.8 180 3.07 3.78 18 0 0 3 3
# mpg against number of cylinders, with a fitted linear regression line.
ggplot(mtcars, aes(x = cyl, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm")
3.3.1 ex 2
# Open the documentation page for the mpg data set in ggplot2.
help("mpg", package = "ggplot2")
Categorical: model, trans, drv, fl, class. Continuous: displ, cty, hwy.
Categorical values place an observation in a group and cannot be measured on a numeric scale, while continuous variables are measured numerically and can take a range of values.
3.5.1 ex 3
# hwy against displ, one row of panels per drv value.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ .)
# hwy against displ, one column of panels per cyl value.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(. ~ cyl)
The “.” is a placeholder meaning "no variable" on that side of the facet formula. `drv ~ .` facets by rows only, so the panels are stacked on top of each other, while `. ~ cyl` facets by columns only, so the panels are displayed next to each other.
3.6.1 ex 2
This code will display a scatter plot with displ on the x axis and hwy on the y axis. Each drv category will be colour coded and will get its own smooth line, drawn without a confidence band.
# hwy against displ, coloured by drv, with one smoother per drv group and no
# confidence band.
ggplot(mpg, aes(x = displ, y = hwy, colour = drv)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
3.7.1 ex 1
# Median depth per cut, with a range running from the minimum to the maximum
# depth. fun.min / fun.max / fun replace the deprecated fun.ymin / fun.ymax /
# fun.y arguments.
ggplot(data = diamonds) +
  stat_summary(
    mapping = aes(x = cut, y = depth),
    fun.min = min,
    fun.max = max,
    fun = median
  )
# The same summary requested from the geom side: a pointrange geom that uses
# the "summary" stat.
ggplot(data = diamonds) +
  geom_pointrange(
    mapping = aes(x = cut, y = depth),
    stat = "summary",
    fun.min = min,
    fun.max = max,
    fun = median
  )
The geom provides a pointrange
3.8.1 ex 1
# Plain scatter plot of hwy against cty.
ggplot(ggplot2::mpg, aes(x = cty, y = hwy)) +
  geom_point()
There is overplotting: many observations share the same combination of hwy and cty, so their points are drawn on top of each other.
A jitter position would reduce the overlap by spreading the points out.
# Same scatter plot, with the points jittered to reveal overlapping
# observations.
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point(position = "jitter")
3.9.1 ex 4
# hwy against cty on a fixed-ratio coordinate system, with the default
# reference line from geom_abline().
ggplot(mpg, aes(x = cty, y = hwy)) +
  coord_fixed() +
  geom_point() +
  geom_abline()
The plot compares highway mpg (hwy) with city mpg (cty). coord_fixed() makes sure that the line produced by geom_abline() is at a 45 degree angle. This makes it easier to see where cty and hwy are the same.
# Two small tables that share the key column "id".
id <- c(1, 2, 3, 4, 5)
age <- c(31, 42, 51, 55, 70)
gender <- c(0, 0, 1, 1, 1)
# Name the columns at construction time so that each table keeps its own
# labels: mydata1 holds id/age and mydata2 holds id/gender.
mydata1 <- data.frame(id = id, age = age)
mydata2 <- data.frame(id = id, gender = gender)
# Join the two tables on the shared key.
merge(mydata1, mydata2, by = "id")
## id age gender
## 1 1 31 0
## 2 2 42 0
## 3 3 51 1
## 4 4 55 1
## 5 5 70 1