Assignment 2

4.1

# Answers (values 1-5) to three questions, ten answers per question:
# elements 1-10 belong to Q1, 11-20 to Q2 and 21-30 to Q3.
data <- c(3, 3, 3, 4, 3, 4, 3, 4, 3, 4,
          5, 2, 5, 5, 2, 2, 5, 5, 4, 2,
          1, 3, 1, 1, 1, 3, 1, 1, 1, 1)
q <- rep(c("Q1", "Q2", "Q3"), each = 10)
# Build the frame column-wise: cbind() would first create a character matrix
# and turn the numeric answers into strings. Declaring all five levels makes
# table() report unused answers as zero counts on every R version.
questions <- data.frame(data = factor(data, levels = 1:5), q = q)

Table 1

# Frequency of each answer to Q1 (rows 1-10).
table(questions[1:10, "data"])

## 
## 1 2 3 4 5 
## 0 0 6 4 0

Table 2

# Frequency of each answer to Q2 (rows 11-20).
table(questions[["data"]][11:20])

## 
## 1 2 3 4 5 
## 0 4 0 1 5

# Cross-tabulate the Q1 answers (rows) against the Q2 answers (columns).
with(questions, table(data[1:10], data[11:20]))

##    
##     1 2 3 4 5
##   1 0 0 0 0 0
##   2 0 0 0 0 0
##   3 0 2 0 1 3
##   4 0 2 0 0 2
##   5 0 0 0 0 0

# Stacked bar chart of the Q2 x Q3 contingency table: one bar per Q3 answer
# (table columns), segmented by Q2 answer (table rows).
barplot(
  table(questions$data[11:20], questions$data[21:30]),
  main = "Barplot Q2 and Q3",
  col = 1:10
)

# Side-by-side box plots of the answers, one box per question label in q.
boxplot(data~as.factor(q), main="Boxplot")

4.2

library('MASS')

# Load the UScereal data set into the workspace.
data('UScereal')
# NOTE(review): attach() places the columns on the search path so that later
# calls can refer to bare names such as mfr, shelf, fat and vitamins. Those
# calls depend on this line; prefer with()/data = arguments in new code.
attach(UScereal)
# List the available columns.
names(UScereal)

##  [1] "mfr"       "calories"  "protein"   "fat"       "sodium"   
##  [6] "fibre"     "carbo"     "sugars"    "shelf"     "potassium"
## [11] "vitamins"

# Count cereals per manufacturer (rows) and display shelf (columns).
# with() looks the columns up inside UScereal directly, so the call no longer
# depends on the data frame having been attach()ed; the printed table is the same.
with(UScereal, table(mfr, shelf))

##    shelf
## mfr  1  2  3
##   G  6  7  9
##   K  4  7 10
##   N  2  0  1
##   P  2  1  6
##   Q  0  3  2
##   R  4  0  1

Manufacturers G, K and P have cereals on every shelf, whereas N, Q and R are each missing from one shelf. N has the fewest cereals of all manufacturers, with a total of 3.

library("ggplot2")

# One box of fat content per vitamins category, so the groups can be compared
# directly. Passing the two columns as separate arguments would instead draw
# two unrelated boxes (fat, and the integer codes of the vitamins factor).
boxplot(fat ~ vitamins, data = UScereal)

Looking at the box plot, there seems to be no relationship between vitamins and fat.

# Scatter plot of fat (x) against shelf (y), one point per cereal.
ggplot(UScereal, aes(x = fat, y = shelf)) +
  geom_point()

Using this plot we can determine that the fattier cereals are placed prominently on the higher shelves.

# Scatter plot of carbo (x) against sugars (y), one point per cereal.
ggplot(UScereal, aes(x = carbo, y = sugars)) +
  geom_point()

While there are cereals with sugar content across the spectrum, they all seem to have a similar amount of carbohydrates.

# Fibre content (x) of each cereal, laid out by manufacturer (y).
ggplot(UScereal, aes(x = fibre, y = mfr)) +
  geom_point()

The majority of manufacturers have cereal products with a similar amount of fibre. However, N and K are outliers, as they have a combined 3 products with an outstanding amount of fibre, in the 25+ range.

# Scatter plot of sodium (x) against sugars (y), one point per cereal.
ggplot(UScereal, aes(x = sodium, y = sugars)) +
  geom_point()

For the most part, sodium and sugars share a similar ratio. While there are outliers, most cereals have about 10-15 g of sugar to 200-400 mg of sodium.

Prediction: Cereals with lower protein will contain higher amounts of fat

# Check the prediction: protein (x) against fat (y), one point per cereal.
ggplot(UScereal, aes(x = protein, y = fat)) +
  geom_point()

4.9

# Load the built-in mtcars data set and list its columns.
data(mtcars)

colnames(mtcars)

##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"

# Highest fuel economy (mpg) in the data set.
max(mtcars[["mpg"]])

## [1] 33.9

# Identify the car with the highest mpg. drop = FALSE keeps the data-frame
# form so the row name (the car) is printed next to the value, rather than
# repeating the bare maximum.
mtcars[which.max(mtcars$mpg), "mpg", drop = FALSE]

##                 mpg
## Toyota Corolla 33.9

# First five cars in the data set.
mtcars[seq_len(5), ]

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2

# Horsepower of the Valiant.
mtcars$hp[rownames(mtcars) == "Valiant"]

## [1] 105

# All recorded values for the Merc 450SLC.
mtcars[rownames(mtcars) == "Merc 450SLC", ]

##              mpg cyl  disp  hp drat   wt qsec vs am gear carb
## Merc 450SLC 15.2   8 275.8 180 3.07 3.78   18  0  0    3    3

# mpg against number of cylinders, with a fitted straight line (lm) overlaid.
ggplot(mtcars, aes(x = cyl, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm")

3.3.1 ex 2

# Open the help page for the mpg data set that ships with ggplot2.
help("mpg", package = "ggplot2")

Categorical: model, trans, drv, fl, class. Continuous: displ, cty, hwy.

Categorical variables take one of a fixed set of labels and cannot be measured numerically, while continuous variables are measured numerically and can vary across a range of values.

3.5.1 ex 3

# displ vs hwy, one panel per drv value arranged as rows.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ .)

# displ vs hwy, one panel per cyl value arranged as columns.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(. ~ cyl)

The “.” is a placeholder meaning “no variable on this dimension”. `drv ~ .` facets by rows only, so the panels are stacked on top of each other, while `. ~ cyl` facets by columns only, so the panels are displayed next to each other.

3.6.1 ex 2

This code will display a scatter plot with displ on the x-axis and hwy on the y-axis. Each drv category will be colour-coded, with a smooth line (without a confidence band) drawn for each category.

# Points and one smooth curve per drv value, both coloured by drv;
# se = FALSE turns off the confidence band around each curve.
ggplot(mpg, aes(displ, hwy, colour = drv)) +
  geom_point() +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

3.7.1 ex 1

# Summarise depth for each cut: a point at the median with a range from the
# minimum to the maximum.
# NOTE(review): ggplot2 >= 3.3.0 deprecates fun.y / fun.ymin / fun.ymax in
# favour of fun / fun.min / fun.max - rename once the installed version allows.
ggplot(diamonds, aes(x = cut, y = depth)) +
  stat_summary(fun.y = median, fun.ymin = min, fun.ymax = max)

# Same summary plot written from the geom side: a pointrange whose values are
# computed by the "summary" stat (median point, min-to-max range).
# NOTE(review): ggplot2 >= 3.3.0 deprecates fun.y / fun.ymin / fun.ymax in
# favour of fun / fun.min / fun.max - rename once the installed version allows.
ggplot(diamonds, aes(x = cut, y = depth)) +
  geom_pointrange(
    stat = "summary",
    fun.y = median,
    fun.ymin = min,
    fun.ymax = max
  )

The default geom associated with stat_summary() is geom_pointrange().

3.8.1 ex 1

# City mpg (x) against highway mpg (y), one point per car.
ggplot(ggplot2::mpg) +
  geom_point(aes(x = cty, y = hwy))

There is overplotting: many cars share the same combination of hwy and cty, so their points are drawn on top of each other.

A jitter position would reduce the overplotting by adding a small amount of random noise to each point, separating the overlapping points out.

# Same scatter plot with jittered points so overlapping observations show up.
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_jitter()

3.9.1 ex 4

# cty vs hwy with a reference line from geom_abline()'s defaults;
# coord_fixed() keeps one unit on x the same length as one unit on y.
ggplot(mpg, aes(cty, hwy)) +
  geom_point() +
  geom_abline() +
  coord_fixed()

The plot compares highway mpg (hwy) with city mpg (cty). coord_fixed() makes sure that the line produced by geom_abline() is at a 45 degree angle. This makes it easier to see where cty and hwy are the same.

# Two small data frames that share the key column "id".
id <- c(1, 2, 3, 4, 5)
age <- c(31, 42, 51, 55, 70)
gender <- c(0, 0, 1, 1, 1)
# Name the columns at construction time so each frame keeps its own measure
# name; otherwise merge() has to disambiguate clashing names with .x / .y.
mydata1 <- data.frame(id = id, age = age)
mydata2 <- data.frame(id = id, gender = gender)

# Join the two frames on id.
merge(mydata1, mydata2, by = "id")

##   id age gender
## 1  1  31      0
## 2  2  42      0
## 3  3  51      1
## 4  4  55      1
## 5  5  70      1