by Niko Hellman

Part II

library(tidyverse)
data("diamonds")

Question 1:

# Overall mean depth and mean price across all diamonds (a one-row tibble).
avgdepthvprice <- summarise(
  diamonds,
  avgdepth = mean(depth),
  avgprice = mean(price)
)

avgdepthvprice

## # A tibble: 1 x 2
##   avgdepth avgprice
##      <dbl>    <dbl>
## 1     61.7    3933.

Question 2:

# Add a price-per-carat column to every diamond; the result is shown, not stored.
mutate(diamonds, ppcarat = price / carat)

## # A tibble: 53,940 x 11
##    carat cut       color clarity depth table price     x     y     z ppcarat
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>   <dbl>
##  1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43   1417.
##  2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31   1552.
##  3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31   1422.
##  4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63   1152.
##  5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75   1081.
##  6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48   1400 
##  7 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47   1400 
##  8 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53   1296.
##  9 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49   1532.
## 10 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39   1470.
## # … with 53,930 more rows

Question 3/4:

# Mean price within each cut grade.
bycut <- diamonds %>%
  group_by(cut) %>%
  summarise(avgprice = mean(price))

## `summarise()` ungrouping output (override with `.groups` argument)

bycut

## # A tibble: 5 x 2
##   cut       avgprice
##   <ord>        <dbl>
## 1 Fair         4359.
## 2 Good         3929.
## 3 Very Good    3982.
## 4 Premium      4584.
## 5 Ideal        3458.

Question 5:

# Mean depth and mean table within each color grade.
bycolor <- diamonds %>%
  group_by(color) %>%
  summarise(
    avgdepth = mean(depth),
    avgtable = mean(table)
  )

## `summarise()` ungrouping output (override with `.groups` argument)

bycolor

## # A tibble: 7 x 3
##   color avgdepth avgtable
##   <ord>    <dbl>    <dbl>
## 1 D         61.7     57.4
## 2 E         61.7     57.5
## 3 F         61.7     57.4
## 4 G         61.8     57.3
## 5 H         61.8     57.5
## 6 I         61.8     57.6
## 7 J         61.9     57.8

Extra credit:

# Attach each color's mean depth and mean table to every diamond row.
# The join key is named explicitly so the join does not depend on whichever
# column names the two tables happen to share (and no "Joining, by" message
# is emitted).
diamonds2 <- left_join(diamonds, bycolor, by = "color")

head(diamonds2)

## # A tibble: 6 x 12
##   carat cut   color clarity depth table price     x     y     z avgdepth
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>    <dbl>
## 1 0.23  Ideal E     SI2      61.5    55   326  3.95  3.98  2.43     61.7
## 2 0.21  Prem… E     SI1      59.8    61   326  3.89  3.84  2.31     61.7
## 3 0.23  Good  E     VS1      56.9    65   327  4.05  4.07  2.31     61.7
## 4 0.290 Prem… I     VS2      62.4    58   334  4.2   4.23  2.63     61.8
## 5 0.31  Good  J     SI2      63.3    58   335  4.34  4.35  2.75     61.9
## 6 0.24  Very… J     VVS2     62.8    57   336  3.94  3.96  2.48     61.9
## # … with 1 more variable: avgtable <dbl>

Question 6: Diamonds of color J have the highest average carat weight (about 1.16 carats).

# Mean carat weight within each color grade.
bycolor2 <- diamonds %>%
  group_by(color) %>%
  summarise(avgcarat = mean(carat))

## `summarise()` ungrouping output (override with `.groups` argument)

bycolor2

## # A tibble: 7 x 2
##   color avgcarat
##   <ord>    <dbl>
## 1 D        0.658
## 2 E        0.658
## 3 F        0.737
## 4 G        0.771
## 5 H        0.912
## 6 I        1.03 
## 7 J        1.16

Question 7: Color G is the most frequent color among diamonds with an Ideal cut.

# Tally Ideal-cut diamonds by color. count() already groups by the columns it
# is given, so a separate group_by() is unnecessary and would leave the result
# grouped by color.
idealcut <- diamonds %>%
  filter(cut == "Ideal") %>%
  count(color)
idealcut

## # A tibble: 7 x 2
##   color     n
##   <ord> <int>
## 1 D      2834
## 2 E      3903
## 3 F      3826
## 4 G      4884
## 5 H      3115
## 6 I      2093
## 7 J       896

Question 8: Diamonds of clarity VVS1 have the highest average table-to-carat ratio (about 141), narrowly ahead of IF (about 140).

# Mean of the per-diamond table/carat ratio within each clarity grade.
avgtablepercarats <- diamonds %>%
  group_by(clarity) %>%
  summarise(avgtpc = mean(table / carat))

## `summarise()` ungrouping output (override with `.groups` argument)

avgtablepercarats

## # A tibble: 8 x 2
##   clarity avgtpc
##   <ord>    <dbl>
## 1 I1        56.3
## 2 SI2       69.1
## 3 SI1       89.6
## 4 VS2      103. 
## 5 VS1      107. 
## 6 VVS2     127. 
## 7 VVS1     141. 
## 8 IF       140.

Question 9: The average price per carat for diamonds costing over 10,000 is about 8,044.

# Mean of the per-diamond price/carat ratio among diamonds priced above 10000.
avgppcarat <- diamonds %>%
  filter(price > 10000) %>%
  summarise(ppcarat = mean(price / carat))
avgppcarat

## # A tibble: 1 x 1
##   ppcarat
##     <dbl>
## 1   8044.

Question 10: The most common clarity among diamonds costing over 10,000 is SI2.

# Number of diamonds priced above 10000 in each clarity grade.
expensive <- diamonds %>%
  filter(price > 10000) %>%
  count(clarity)
expensive

## # A tibble: 8 x 2
##   clarity     n
##   <ord>   <int>
## 1 I1         30
## 2 SI2      1239
## 3 SI1      1184
## 4 VS2      1155
## 5 VS1       747
## 6 VVS2      452
## 7 VVS1      247
## 8 IF        168

Part III

Data description

# Bring the built-in ToothGrowth data set into the workspace
data("ToothGrowth")
# Open its help page for background on the data
help("ToothGrowth")
# Show each column's type and first few values
str(ToothGrowth)

## 'data.frame':    60 obs. of  3 variables:
##  $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
##  $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dose: num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...

# Preview the first six rows
head(ToothGrowth, n = 6L)

##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5
## 5  6.4   VC  0.5
## 6 10.0   VC  0.5

Data structure

Question 1: The rows represent the observations. In this case, each of the 60 rows represents one of the 60 guinea pigs observed in this experiment.

Question 2: The columns represent tooth length, supplement type, and dose. Tooth length (len) is a continuous numeric variable, supplement type (supp) is a nominal (unordered) categorical variable, and dose is a discrete numeric variable.

Question 3: Tooth length is the response variable while dose and supplement type are the explanatory variables.

Question 4: My exploratory hypothesis (based on my limited knowledge relating to any aspect of this data set) is that higher doses will lead to higher tooth length regardless of the supplement type.

Graphics and EDA

Question 5:

# Box plot of tooth length for each supplement type.
gpig <- ggplot(ToothGrowth, aes(x = supp, y = len)) +
  geom_boxplot(fill = "pink")
gpig

(pink because I think guinea pigs have pink skin… Or maybe I am mixing them up with naked mole rats.)

Question 6:

# Same box plot, split into one panel per dose level.
gpig + facet_wrap(~dose)

Question 7: We see an upward trend with increased dosage: guinea pigs that received higher doses of either supplement type had greater tooth length. Furthermore, at doses of 0.5 and 1 we see greater tooth length with the OJ supplement type. However, at a dose of 2, the resulting tooth lengths are comparable across supplement types.

Question 8: For the most part, my exploratory hypothesis is supported: increased dose results in greater tooth length regardless of supplement type. However, I am perplexed by the shrinking difference between supplement types at the highest dose level.

Midterm #1

Part II

Part III

Data description

Data structure

Graphics and EDA