library(tidyverse)
## -- Attaching packages -------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ----------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#Midterm Part 2: Tidyverse Data Wrangling Drills
#view(diamonds)
#Question 1
# Keep only the depth and price columns of `diamonds` as a plain data.frame.
# Columns are chosen by name instead of by position (5 and 7) so the result
# cannot silently change if the column order of `diamonds` ever differs.
diamonds_depth_price <- data.frame(select(diamonds, depth, price))
head(diamonds_depth_price)
## depth price
## 1 61.5 326
## 2 59.8 326
## 3 56.9 327
## 4 62.4 334
## 5 63.3 335
## 6 62.8 336
#Question 2
# Add a price-per-carat column to every diamond.
pricePercarat <- diamonds %>%
  mutate(pricepercarat = price / carat)
# Keep depth, price and the new column, selected by name. Position 11 is only
# `pricepercarat` while `diamonds` still has its original 10 columns; this
# script later reassigns `diamonds` with extra columns, after which index 11
# would pick up a different column on a re-run.
# NOTE: this overwrites the `diamonds_depth_price` object created earlier.
diamonds_depth_price <- data.frame(
  select(pricePercarat, depth, price, pricepercarat)
)
head(diamonds_depth_price)
## depth price pricepercarat
## 1 61.5 326 1417.391
## 2 59.8 326 1552.381
## 3 56.9 327 1421.739
## 4 62.4 334 1151.724
## 5 63.3 335 1080.645
## 6 62.8 336 1400.000
#Question 3 and 4
# Mean price for each cut, stored as a plain data.frame (one row per cut).
diamondsbyCut <- diamonds %>%
  group_by(cut) %>%
  summarise(meanprice = mean(price)) %>%
  data.frame()
## `summarise()` ungrouping output (override with `.groups` argument)
head(diamondsbyCut)
## cut meanprice
## 1 Fair 4358.758
## 2 Good 3928.864
## 3 Very Good 3981.760
## 4 Premium 4584.258
## 5 Ideal 3457.542
#Question 5
# Per-color summary: number of diamonds plus mean depth and mean table.
# Missing values are skipped when averaging (na.rm = TRUE).
diamondsbyColor <- summarise(
  group_by(diamonds, color),
  n = n(),
  meandepth = mean(depth, na.rm = TRUE),
  meantable = mean(table, na.rm = TRUE)
)
## `summarise()` ungrouping output (override with `.groups` argument)
head(diamondsbyColor)
## # A tibble: 6 x 4
## color n meandepth meantable
## <ord> <int> <dbl> <dbl>
## 1 D 6775 61.7 57.4
## 2 E 9797 61.7 57.5
## 3 F 9542 61.7 57.4
## 4 G 11292 61.8 57.3
## 5 H 8304 61.8 57.5
## 6 I 5422 61.8 57.6
#Extra Credit Question
# Attach the per-color summary columns (n, meandepth, meantable) to every
# diamond row. From here on `diamonds` in this session is the 13-column
# joined table, which masks the dataset shipped with ggplot2.
#
# The join key is spelled out so the match never depends on which column
# names the two tables happen to share. The left-hand side is the pristine
# ggplot2 dataset, so re-running this line does not join onto an
# already-joined table and create suffixed duplicates (n.x, n.y, ...).
diamonds <- left_join(ggplot2::diamonds, diamondsbyColor, by = "color")
head(diamonds)
## # A tibble: 6 x 13
## carat cut color clarity depth table price x y z n meandepth
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 9797 61.7
## 2 0.21 Prem~ E SI1 59.8 61 326 3.89 3.84 2.31 9797 61.7
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 9797 61.7
## 4 0.290 Prem~ I VS2 62.4 58 334 4.2 4.23 2.63 5422 61.8
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 2808 61.9
## 6 0.24 Very~ J VVS2 62.8 57 336 3.94 3.96 2.48 2808 61.9
## # ... with 1 more variable: meantable <dbl>
#Question 6 - Color J seems to have the biggest diamonds.
# Number of diamonds and mean carat weight for each color (printed, not stored).
diamonds %>%
  group_by(color) %>%
  summarise(
    n = n(),
    meancarat = mean(carat, na.rm = TRUE)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 7 x 3
## color n meancarat
## <ord> <int> <dbl>
## 1 D 6775 0.658
## 2 E 9797 0.658
## 3 F 9542 0.737
## 4 G 11292 0.771
## 5 H 8304 0.912
## 6 I 5422 1.03
## 7 J 2808 1.16
#Question 7 - Color G seems to be the most popular Ideal cut color
# Count of Ideal-cut diamonds for each color (printed, not stored).
diamonds %>%
  filter(cut == "Ideal") %>%
  group_by(color) %>%
  summarise(n = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 7 x 2
## color n
## <ord> <int>
## 1 D 2834
## 2 E 3903
## 3 F 3826
## 4 G 4884
## 5 H 3115
## 6 I 2093
## 7 J 896
#Question 8 - clarity VVS1 has 141 average table/carat
# Number of diamonds and mean table-per-carat ratio for each clarity grade.
# The ratio is computed inside summarise() rather than in a separate mutate();
# the per-group means are the same.
diamonds %>%
  group_by(clarity) %>%
  summarise(
    n = n(),
    meantablepercarats = mean(table / carat)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 8 x 3
## clarity n meantablepercarats
## <ord> <int> <dbl>
## 1 I1 741 56.3
## 2 SI2 9194 69.1
## 3 SI1 13065 89.6
## 4 VS2 12258 103.
## 5 VS1 8171 107.
## 6 VVS2 5066 127.
## 7 VVS1 3655 141.
## 8 IF 1790 140.
#Question 9 - The mean price per carat of diamonds over $10,000 is $8,044 per carat.
# Among diamonds priced above 10000: how many there are and their mean
# price per carat. The ratio is computed directly inside summarise().
diamonds %>%
  filter(price > 10000) %>%
  summarise(
    n = n(),
    meanpricepercarat = mean(price / carat)
  )
## # A tibble: 1 x 2
## n meanpricepercarat
## <int> <dbl>
## 1 5222 8044.
#Question 10 - Clarity SI2 seems to be the most common clarity out of diamonds that cost more than $10000.
# Count of diamonds priced above 10000 for each clarity grade.
diamonds %>%
  filter(price > 10000) %>%
  group_by(clarity) %>%
  summarise(n = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 8 x 2
## clarity n
## <ord> <int>
## 1 I1 30
## 2 SI2 1239
## 3 SI1 1184
## 4 VS2 1155
## 5 VS1 747
## 6 VVS2 452
## 7 VVS1 247
## 8 IF 168
#Midterm Part 3: Data Viz
#view(ToothGrowth)
#?ToothGrowth
#str(ToothGrowth)
#Question 1 - Rows in a data set represent observations for each participant. In this data set, each row holds the condition levels and the DV measure for one guinea pig.
#Question 2 - Columns in a data set represent variables. In this data set, there are 3 variables: length (numeric, continuous), supplement (categorical/factor), and dose (numeric, discrete, but I could also treat it as categorical/ordinal depending on my question).
#Question 3 - The response is the length of the tooth (growth), while the IVs are dose and supp.
#Question 4 - H0: That all groups will be equal in length (Dose 1 = Dose 2 and supp 1 = supp2)
#Question 5
# Boxplot of tooth length (len) for each supplement type (supp).
ggplot(data = ToothGrowth, mapping = aes(x = supp, y = len)) +
  geom_boxplot()
#Question 6
# Same boxplot of len by supp, drawn in one panel per dose level.
ggplot(data = ToothGrowth, mapping = aes(x = supp, y = len)) +
  geom_boxplot() +
  facet_wrap(~dose)
#Question 7 - It seems like as dose increases, tooth growth also increases. Additionally, OJ seems to work better than VC, but only up to a dose of 2.0. This gives support to vitamin C aiding in tooth growth, with OJ being the preferred method up to a point.
#Question 8 - The results do not fit my hypothesis because there were differences in tooth growth depending on the conditions. I would like to know more about what the different spreads mean in terms of this study.